work on full text search
This commit is contained in:
parent
d51e965908
commit
1adb9c807a
|
@ -98,6 +98,7 @@ dependencies = [
|
|||
"libc",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"serde",
|
||||
"time",
|
||||
"winapi",
|
||||
]
|
||||
|
@ -255,6 +256,9 @@ dependencies = [
|
|||
name = "fulltext"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"serde",
|
||||
"serde-lexpr",
|
||||
"tantivy",
|
||||
]
|
||||
|
||||
|
|
|
@ -8,3 +8,6 @@ edition = "2018"
|
|||
|
||||
[dependencies]
|
||||
tantivy = "0.15.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde-lexpr = "0.1.2"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
|
|
|
@ -1,86 +1,68 @@
|
|||
use std::result::Result;
|
||||
|
||||
use chrono::prelude::*;
|
||||
use serde::{Serialize, Deserialize};
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::ReloadPolicy;
|
||||
use tantivy::schema::{Schema, Facet, INDEXED, STORED, STRING, TEXT, FAST};
|
||||
use tantivy::{Document, Index, IndexWriter, ReloadPolicy};
|
||||
|
||||
pub fn make_index() -> tantivy::Result<()> {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct CapybaraPage {
|
||||
path: String,
|
||||
title: String,
|
||||
date: DateTime<Utc>,
|
||||
summary: String,
|
||||
tags: Vec<String>,
|
||||
authors: Vec<String>,
|
||||
body: String
|
||||
}
|
||||
|
||||
pub fn example_page() -> CapybaraPage {
|
||||
CapybaraPage {
|
||||
path: "/meow/meow2".into(),
|
||||
title: "The Old Man and the Sea".into(),
|
||||
date: Utc::now(),
|
||||
summary: "meow meow meow".into(),
|
||||
tags: vec!["tag1".into(), "WaterDrinkers".into()],
|
||||
authors: vec!["haskal".into()],
|
||||
body: "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.".into()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn make_schema() -> Schema {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("xref_path", STRING | STORED);
|
||||
schema_builder.add_facet_field("xref", INDEXED);
|
||||
schema_builder.add_text_field("title", TEXT | STORED);
|
||||
schema_builder.add_date_field("date", FAST);
|
||||
schema_builder.add_text_field("summary", TEXT | STORED);
|
||||
schema_builder.add_text_field("tags", TEXT | STORED);
|
||||
schema_builder.add_text_field("authors", TEXT | STORED);
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
schema_builder.build()
|
||||
}
|
||||
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
pub fn add_document(schema: &Schema, index_writer: &mut IndexWriter, doc: &CapybaraPage) {
|
||||
let xref_path = schema.get_field("xref_path").unwrap();
|
||||
let xref = schema.get_field("xref").unwrap();
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let date = schema.get_field("date").unwrap();
|
||||
let summary = schema.get_field("summary").unwrap();
|
||||
let tags = schema.get_field("tags").unwrap();
|
||||
let authors = schema.get_field("authors").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
let mut old_man_doc = Document::default();
|
||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.",
|
||||
);
|
||||
|
||||
index_writer.add_document(old_man_doc);
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
title => "The Modern Prometheus",
|
||||
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
));
|
||||
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()?;
|
||||
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
let query = query_parser.parse_query("sea whale")?;
|
||||
|
||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||
|
||||
for (_score, doc_address) in top_docs {
|
||||
let retrieved_doc = searcher.doc(doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
let mut out_doc = Document::default();
|
||||
out_doc.add_text(xref_path, &doc.path);
|
||||
out_doc.add_facet(xref, Facet::from(&doc.path));
|
||||
out_doc.add_text(title, &doc.title);
|
||||
out_doc.add_date(date, &doc.date);
|
||||
out_doc.add_text(summary, &doc.summary);
|
||||
out_doc.add_text(tags, doc.tags.join(" "));
|
||||
out_doc.add_text(authors, doc.authors.join(" "));
|
||||
out_doc.add_text(body, &doc.body);
|
||||
index_writer.add_document(out_doc);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
pub fn main() {
|
||||
let schema = fulltext::make_schema();
|
||||
let page = fulltext::example_page();
|
||||
|
||||
let index = tantivy::Index::create_in_dir("/tmp/tantivy-test", schema.clone()).unwrap();
|
||||
let mut index_writer = index.writer(50_000_000).unwrap();
|
||||
|
||||
fulltext::add_document(&schema, &mut index_writer, &page);
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
let reader = index.reader_builder().reload_policy(tantivy::ReloadPolicy::OnCommit)
|
||||
.try_into().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let query_parser = tantivy::query::QueryParser::for_index(&index,
|
||||
vec![schema.get_field("title").unwrap(), schema.get_field("body").unwrap()]);
|
||||
|
||||
let query = query_parser.parse_query("man fish").unwrap();
|
||||
let top_docs = searcher.search(&query, &tantivy::collector::TopDocs::with_limit(10)).unwrap();
|
||||
println!("top docs");
|
||||
for (_score, doc_address) in top_docs {
|
||||
let retrieved_doc = searcher.doc(doc_address).unwrap();
|
||||
println!("got doc: {}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
println!("end of docs");
|
||||
}
|
27
util.rkt
27
util.rkt
|
@ -1,13 +1,36 @@
|
|||
#lang racket/base
|
||||
|
||||
(require racket/date racket/format racket/string racket/port)
|
||||
(require racket/date racket/format racket/match racket/string racket/port)
|
||||
|
||||
(provide ~r/pad run-external get-date-ymd)
|
||||
(provide ~r/pad run-external get-date-ymd date->string/iso-8601)
|
||||
|
||||
;; Returns the current date as a three-element list: (year month day).
(define (get-date-ymd)
  (let ([today (current-date)])
    (list (date-year today) (date-month today) (date-day today))))
|
||||
|
||||
;; Formats the date struct D as an ISO 8601 timestamp with an explicit UTC offset, shaped like
;; "2021-05-04T13:07:09-04:00".
;;
;; The output of (date-display-format 'iso-8601) leaves the time zone offset out entirely; the
;; reason is unknown. Appending the offset to the result of date->string would break if a future
;; version of Racket started emitting it, so the whole string is assembled from the struct
;; fields here instead.
(define (date->string/iso-8601 d)
  (match d
    [(date second minute hour day month year _ _ _ time-zone-offset)
     ;; Reduce the offset to whole minutes (sign dropped), then split into hours and minutes.
     (define-values [offset-hours offset-minutes]
       (quotient/remainder (abs (quotient time-zone-offset 60)) 60))
     (define sign (if (negative? time-zone-offset) "-" "+"))
     (define calendar
       (string-join (list (~r/pad year 4) (~r/pad month 2) (~r/pad day 2)) "-"))
     (define clock
       (string-join (list (~r/pad hour 2) (~r/pad minute 2) (~r/pad second 2)) ":"))
     (string-append calendar "T" clock
                    sign (~r/pad offset-hours 2) ":" (~r/pad offset-minutes 2))]))
|
||||
|
||||
|
||||
;; Renders NUM with ~r, left-padding the result with "0" up to PAD-TO characters wide.
(define (~r/pad num pad-to)
  (~r num
      #:pad-string "0"
      #:min-width pad-to))
|
||||
|
||||
|
|
Loading…
Reference in New Issue