work on full text search
This commit is contained in:
parent
d51e965908
commit
1adb9c807a
|
@ -98,6 +98,7 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"num-integer",
|
"num-integer",
|
||||||
"num-traits",
|
"num-traits",
|
||||||
|
"serde",
|
||||||
"time",
|
"time",
|
||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
@ -255,6 +256,9 @@ dependencies = [
|
||||||
name = "fulltext"
|
name = "fulltext"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"chrono",
|
||||||
|
"serde",
|
||||||
|
"serde-lexpr",
|
||||||
"tantivy",
|
"tantivy",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -8,3 +8,6 @@ edition = "2018"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
tantivy = "0.15.0"
|
tantivy = "0.15.0"
|
||||||
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
|
serde-lexpr = "0.1.2"
|
||||||
|
chrono = { version = "0.4", features = ["serde"] }
|
||||||
|
|
|
@ -1,86 +1,68 @@
|
||||||
|
use std::result::Result;
|
||||||
|
|
||||||
|
use chrono::prelude::*;
|
||||||
|
use serde::{Serialize, Deserialize};
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate tantivy;
|
extern crate tantivy;
|
||||||
|
|
||||||
use tantivy::collector::TopDocs;
|
use tantivy::collector::TopDocs;
|
||||||
use tantivy::query::QueryParser;
|
use tantivy::query::QueryParser;
|
||||||
use tantivy::schema::*;
|
use tantivy::schema::{Schema, Facet, INDEXED, STORED, STRING, TEXT, FAST};
|
||||||
use tantivy::Index;
|
use tantivy::{Document, Index, IndexWriter, ReloadPolicy};
|
||||||
use tantivy::ReloadPolicy;
|
|
||||||
|
|
||||||
pub fn make_index() -> tantivy::Result<()> {
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
pub struct CapybaraPage {
|
||||||
|
path: String,
|
||||||
|
title: String,
|
||||||
|
date: DateTime<Utc>,
|
||||||
|
summary: String,
|
||||||
|
tags: Vec<String>,
|
||||||
|
authors: Vec<String>,
|
||||||
|
body: String
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn example_page() -> CapybaraPage {
|
||||||
|
CapybaraPage {
|
||||||
|
path: "/meow/meow2".into(),
|
||||||
|
title: "The Old Man and the Sea".into(),
|
||||||
|
date: Utc::now(),
|
||||||
|
summary: "meow meow meow".into(),
|
||||||
|
tags: vec!["tag1".into(), "WaterDrinkers".into()],
|
||||||
|
authors: vec!["haskal".into()],
|
||||||
|
body: "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.".into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn make_schema() -> Schema {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
|
schema_builder.add_text_field("xref_path", STRING | STORED);
|
||||||
|
schema_builder.add_facet_field("xref", INDEXED);
|
||||||
schema_builder.add_text_field("title", TEXT | STORED);
|
schema_builder.add_text_field("title", TEXT | STORED);
|
||||||
|
schema_builder.add_date_field("date", FAST);
|
||||||
|
schema_builder.add_text_field("summary", TEXT | STORED);
|
||||||
|
schema_builder.add_text_field("tags", TEXT | STORED);
|
||||||
|
schema_builder.add_text_field("authors", TEXT | STORED);
|
||||||
schema_builder.add_text_field("body", TEXT);
|
schema_builder.add_text_field("body", TEXT);
|
||||||
let schema = schema_builder.build();
|
schema_builder.build()
|
||||||
|
}
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema.clone());
|
pub fn add_document(schema: &Schema, index_writer: &mut IndexWriter, doc: &CapybaraPage) {
|
||||||
|
let xref_path = schema.get_field("xref_path").unwrap();
|
||||||
let mut index_writer = index.writer(50_000_000)?;
|
let xref = schema.get_field("xref").unwrap();
|
||||||
let title = schema.get_field("title").unwrap();
|
let title = schema.get_field("title").unwrap();
|
||||||
|
let date = schema.get_field("date").unwrap();
|
||||||
|
let summary = schema.get_field("summary").unwrap();
|
||||||
|
let tags = schema.get_field("tags").unwrap();
|
||||||
|
let authors = schema.get_field("authors").unwrap();
|
||||||
let body = schema.get_field("body").unwrap();
|
let body = schema.get_field("body").unwrap();
|
||||||
|
|
||||||
let mut old_man_doc = Document::default();
|
let mut out_doc = Document::default();
|
||||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
out_doc.add_text(xref_path, &doc.path);
|
||||||
old_man_doc.add_text(
|
out_doc.add_facet(xref, Facet::from(&doc.path));
|
||||||
body,
|
out_doc.add_text(title, &doc.title);
|
||||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
out_doc.add_date(date, &doc.date);
|
||||||
he had gone eighty-four days now without taking a fish.",
|
out_doc.add_text(summary, &doc.summary);
|
||||||
);
|
out_doc.add_text(tags, doc.tags.join(" "));
|
||||||
|
out_doc.add_text(authors, doc.authors.join(" "));
|
||||||
index_writer.add_document(old_man_doc);
|
out_doc.add_text(body, &doc.body);
|
||||||
|
index_writer.add_document(out_doc);
|
||||||
index_writer.add_document(doc!(
|
|
||||||
title => "Of Mice and Men",
|
|
||||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
|
||||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
|
||||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
|
||||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
|
||||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
|
||||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
|
||||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
|
||||||
limbs and branches that arch over the pool"
|
|
||||||
));
|
|
||||||
|
|
||||||
index_writer.add_document(doc!(
|
|
||||||
title => "Of Mice and Men",
|
|
||||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
|
||||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
|
||||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
|
||||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
|
||||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
|
||||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
|
||||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
|
||||||
limbs and branches that arch over the pool"
|
|
||||||
));
|
|
||||||
|
|
||||||
index_writer.add_document(doc!(
|
|
||||||
title => "Frankenstein",
|
|
||||||
title => "The Modern Prometheus",
|
|
||||||
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
|
||||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
|
||||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
|
||||||
increasing confidence in the success of my undertaking."
|
|
||||||
));
|
|
||||||
|
|
||||||
index_writer.commit()?;
|
|
||||||
|
|
||||||
let reader = index
|
|
||||||
.reader_builder()
|
|
||||||
.reload_policy(ReloadPolicy::OnCommit)
|
|
||||||
.try_into()?;
|
|
||||||
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
|
|
||||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
|
||||||
|
|
||||||
let query = query_parser.parse_query("sea whale")?;
|
|
||||||
|
|
||||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
|
||||||
|
|
||||||
for (_score, doc_address) in top_docs {
|
|
||||||
let retrieved_doc = searcher.doc(doc_address)?;
|
|
||||||
println!("{}", schema.to_json(&retrieved_doc));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
pub fn main() {
|
||||||
|
let schema = fulltext::make_schema();
|
||||||
|
let page = fulltext::example_page();
|
||||||
|
|
||||||
|
let index = tantivy::Index::create_in_dir("/tmp/tantivy-test", schema.clone()).unwrap();
|
||||||
|
let mut index_writer = index.writer(50_000_000).unwrap();
|
||||||
|
|
||||||
|
fulltext::add_document(&schema, &mut index_writer, &page);
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
|
||||||
|
let reader = index.reader_builder().reload_policy(tantivy::ReloadPolicy::OnCommit)
|
||||||
|
.try_into().unwrap();
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
let query_parser = tantivy::query::QueryParser::for_index(&index,
|
||||||
|
vec![schema.get_field("title").unwrap(), schema.get_field("body").unwrap()]);
|
||||||
|
|
||||||
|
let query = query_parser.parse_query("man fish").unwrap();
|
||||||
|
let top_docs = searcher.search(&query, &tantivy::collector::TopDocs::with_limit(10)).unwrap();
|
||||||
|
println!("top docs");
|
||||||
|
for (_score, doc_address) in top_docs {
|
||||||
|
let retrieved_doc = searcher.doc(doc_address).unwrap();
|
||||||
|
println!("got doc: {}", schema.to_json(&retrieved_doc));
|
||||||
|
}
|
||||||
|
println!("end of docs");
|
||||||
|
}
|
27
util.rkt
27
util.rkt
|
@ -1,13 +1,36 @@
|
||||||
#lang racket/base
|
#lang racket/base
|
||||||
|
|
||||||
(require racket/date racket/format racket/string racket/port)
|
(require racket/date racket/format racket/match racket/string racket/port)
|
||||||
|
|
||||||
(provide ~r/pad run-external get-date-ymd)
|
(provide ~r/pad run-external get-date-ymd date->string/iso-8601)
|
||||||
|
|
||||||
(define (get-date-ymd)
|
(define (get-date-ymd)
|
||||||
(define date (current-date))
|
(define date (current-date))
|
||||||
(list (date-year date) (date-month date) (date-day date)))
|
(list (date-year date) (date-month date) (date-day date)))
|
||||||
|
|
||||||
|
;; (date-display-format 'iso-8601) fails to take into account the time zone offset
|
||||||
|
;; like at all
|
||||||
|
;; idk why this is
|
||||||
|
;; anyway since i don't want to just append that to the end of date->string in case in a future
|
||||||
|
;; version of racket this actually changes, this implements a "true" iso-8601 function from scratch
|
||||||
|
(define (date->string/iso-8601 d)
|
||||||
|
(match d
|
||||||
|
[(date second minute hour day month year _ _ _ time-zone-offset)
|
||||||
|
(define offset* (abs (quotient time-zone-offset 60)))
|
||||||
|
(define-values [offset-hours offset-minutes]
|
||||||
|
(quotient/remainder offset* 60))
|
||||||
|
(format "~a-~a-~aT~a:~a:~a~a~a:~a"
|
||||||
|
(~r/pad year 4)
|
||||||
|
(~r/pad month 2)
|
||||||
|
(~r/pad day 2)
|
||||||
|
(~r/pad hour 2)
|
||||||
|
(~r/pad minute 2)
|
||||||
|
(~r/pad second 2)
|
||||||
|
(if (negative? time-zone-offset) "-" "+")
|
||||||
|
(~r/pad offset-hours 2)
|
||||||
|
(~r/pad offset-minutes 2))]))
|
||||||
|
|
||||||
|
|
||||||
(define (~r/pad num pad-to)
|
(define (~r/pad num pad-to)
|
||||||
(~r num #:min-width pad-to #:pad-string "0"))
|
(~r num #:min-width pad-to #:pad-string "0"))
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue