From 1adb9c807a4031d3aefaa2e0fc286ca1de5274c0 Mon Sep 17 00:00:00 2001 From: haskal Date: Sat, 12 Jun 2021 04:37:32 -0400 Subject: [PATCH] work on full text search --- ext/Cargo.lock | 4 ++ ext/fulltext/Cargo.toml | 3 + ext/fulltext/src/lib.rs | 128 +++++++++++++++++---------------------- ext/fulltext/src/main.rs | 25 ++++++++ util.rkt | 27 ++++++++- 5 files changed, 112 insertions(+), 75 deletions(-) create mode 100644 ext/fulltext/src/main.rs diff --git a/ext/Cargo.lock b/ext/Cargo.lock index 24a9d3e..04244ea 100644 --- a/ext/Cargo.lock +++ b/ext/Cargo.lock @@ -98,6 +98,7 @@ dependencies = [ "libc", "num-integer", "num-traits", + "serde", "time", "winapi", ] @@ -255,6 +256,9 @@ dependencies = [ name = "fulltext" version = "0.1.0" dependencies = [ + "chrono", + "serde", + "serde-lexpr", "tantivy", ] diff --git a/ext/fulltext/Cargo.toml b/ext/fulltext/Cargo.toml index 2424584..f08fa90 100644 --- a/ext/fulltext/Cargo.toml +++ b/ext/fulltext/Cargo.toml @@ -8,3 +8,6 @@ edition = "2018" [dependencies] tantivy = "0.15.0" +serde = { version = "1.0", features = ["derive"] } +serde-lexpr = "0.1.2" +chrono = { version = "0.4", features = ["serde"] } diff --git a/ext/fulltext/src/lib.rs b/ext/fulltext/src/lib.rs index 900b1bd..d394082 100644 --- a/ext/fulltext/src/lib.rs +++ b/ext/fulltext/src/lib.rs @@ -1,86 +1,68 @@ +use std::result::Result; + +use chrono::prelude::*; +use serde::{Serialize, Deserialize}; #[macro_use] extern crate tantivy; - use tantivy::collector::TopDocs; use tantivy::query::QueryParser; -use tantivy::schema::*; -use tantivy::Index; -use tantivy::ReloadPolicy; +use tantivy::schema::{Schema, Facet, INDEXED, STORED, STRING, TEXT, FAST}; +use tantivy::{Document, Index, IndexWriter, ReloadPolicy}; -pub fn make_index() -> tantivy::Result<()> { +#[derive(Serialize, Deserialize, Debug)] +pub struct CapybaraPage { + path: String, + title: String, + date: DateTime<Utc>, + summary: String, + tags: Vec<String>, + authors: Vec<String>, + body: String +} + +pub fn example_page() -> 
CapybaraPage { + CapybaraPage { + path: "/meow/meow2".into(), + title: "The Old Man and the Sea".into(), + date: Utc::now(), + summary: "meow meow meow".into(), + tags: vec!["tag1".into(), "WaterDrinkers".into()], + authors: vec!["haskal".into()], + body: "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.".into() + } +} + +pub fn make_schema() -> Schema { let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("xref_path", STRING | STORED); + schema_builder.add_facet_field("xref", INDEXED); schema_builder.add_text_field("title", TEXT | STORED); + schema_builder.add_date_field("date", FAST); + schema_builder.add_text_field("summary", TEXT | STORED); + schema_builder.add_text_field("tags", TEXT | STORED); + schema_builder.add_text_field("authors", TEXT | STORED); schema_builder.add_text_field("body", TEXT); - let schema = schema_builder.build(); + schema_builder.build() +} - let index = Index::create_in_ram(schema.clone()); - - let mut index_writer = index.writer(50_000_000)?; +pub fn add_document(schema: &Schema, index_writer: &mut IndexWriter, doc: &CapybaraPage) { + let xref_path = schema.get_field("xref_path").unwrap(); + let xref = schema.get_field("xref").unwrap(); let title = schema.get_field("title").unwrap(); + let date = schema.get_field("date").unwrap(); + let summary = schema.get_field("summary").unwrap(); + let tags = schema.get_field("tags").unwrap(); + let authors = schema.get_field("authors").unwrap(); let body = schema.get_field("body").unwrap(); - let mut old_man_doc = Document::default(); - old_man_doc.add_text(title, "The Old Man and the Sea"); - old_man_doc.add_text( - body, - "He was an old man who fished alone in a skiff in the Gulf Stream and \ - he had gone eighty-four days now without taking a fish.", - ); - - index_writer.add_document(old_man_doc); - - index_writer.add_document(doc!( - title => "Of Mice and Men", - body => "A few miles south of 
Soledad, the Salinas River drops in close to the hillside \ - bank and runs deep and green. The water is warm too, for it has slipped twinkling \ - over the yellow sands in the sunlight before reaching the narrow pool. On one \ - side of the river the golden foothill slopes curve up to the strong and rocky \ - Gabilan Mountains, but on the valley side the water is lined with trees—willows \ - fresh and green with every spring, carrying in their lower leaf junctures the \ - debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ - limbs and branches that arch over the pool" - )); - - index_writer.add_document(doc!( - title => "Of Mice and Men", - body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ - bank and runs deep and green. The water is warm too, for it has slipped twinkling \ - over the yellow sands in the sunlight before reaching the narrow pool. On one \ - side of the river the golden foothill slopes curve up to the strong and rocky \ - Gabilan Mountains, but on the valley side the water is lined with trees—willows \ - fresh and green with every spring, carrying in their lower leaf junctures the \ - debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ - limbs and branches that arch over the pool" - )); - - index_writer.add_document(doc!( - title => "Frankenstein", - title => "The Modern Prometheus", - body => "You will rejoice to hear that no disaster has accompanied the commencement of an \ - enterprise which you have regarded with such evil forebodings. I arrived here \ - yesterday, and my first task is to assure my dear sister of my welfare and \ - increasing confidence in the success of my undertaking." 
- )); - - index_writer.commit()?; - - let reader = index - .reader_builder() - .reload_policy(ReloadPolicy::OnCommit) - .try_into()?; - - let searcher = reader.searcher(); - - let query_parser = QueryParser::for_index(&index, vec![title, body]); - - let query = query_parser.parse_query("sea whale")?; - - let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; - - for (_score, doc_address) in top_docs { - let retrieved_doc = searcher.doc(doc_address)?; - println!("{}", schema.to_json(&retrieved_doc)); - } - - Ok(()) + let mut out_doc = Document::default(); + out_doc.add_text(xref_path, &doc.path); + out_doc.add_facet(xref, Facet::from(&doc.path)); + out_doc.add_text(title, &doc.title); + out_doc.add_date(date, &doc.date); + out_doc.add_text(summary, &doc.summary); + out_doc.add_text(tags, doc.tags.join(" ")); + out_doc.add_text(authors, doc.authors.join(" ")); + out_doc.add_text(body, &doc.body); + index_writer.add_document(out_doc); } diff --git a/ext/fulltext/src/main.rs b/ext/fulltext/src/main.rs new file mode 100644 index 0000000..c165469 --- /dev/null +++ b/ext/fulltext/src/main.rs @@ -0,0 +1,25 @@ +pub fn main() { + let schema = fulltext::make_schema(); + let page = fulltext::example_page(); + + let index = tantivy::Index::create_in_dir("/tmp/tantivy-test", schema.clone()).unwrap(); + let mut index_writer = index.writer(50_000_000).unwrap(); + + fulltext::add_document(&schema, &mut index_writer, &page); + index_writer.commit().unwrap(); + + let reader = index.reader_builder().reload_policy(tantivy::ReloadPolicy::OnCommit) + .try_into().unwrap(); + let searcher = reader.searcher(); + let query_parser = tantivy::query::QueryParser::for_index(&index, + vec![schema.get_field("title").unwrap(), schema.get_field("body").unwrap()]); + + let query = query_parser.parse_query("man fish").unwrap(); + let top_docs = searcher.search(&query, &tantivy::collector::TopDocs::with_limit(10)).unwrap(); + println!("top docs"); + for (_score, doc_address) in top_docs { + 
let retrieved_doc = searcher.doc(doc_address).unwrap(); + println!("got doc: {}", schema.to_json(&retrieved_doc)); + } + println!("end of docs"); +} diff --git a/util.rkt b/util.rkt index 9844032..d01c02b 100644 --- a/util.rkt +++ b/util.rkt @@ -1,13 +1,36 @@ #lang racket/base -(require racket/date racket/format racket/string racket/port) +(require racket/date racket/format racket/match racket/string racket/port) -(provide ~r/pad run-external get-date-ymd) +(provide ~r/pad run-external get-date-ymd date->string/iso-8601) (define (get-date-ymd) (define date (current-date)) (list (date-year date) (date-month date) (date-day date))) +;; (date-display-format 'iso-8601) fails to take into account the time zone offset +;; like at all +;; idk why this is +;; anyway since i don't want to just append that to the end of date->string in case in a future +;; version of racket this actually changes, this implements a "true" iso-8601 function from scratch +(define (date->string/iso-8601 d) + (match d + [(date second minute hour day month year _ _ _ time-zone-offset) + (define offset* (abs (quotient time-zone-offset 60))) + (define-values [offset-hours offset-minutes] + (quotient/remainder offset* 60)) + (format "~a-~a-~aT~a:~a:~a~a~a:~a" + (~r/pad year 4) + (~r/pad month 2) + (~r/pad day 2) + (~r/pad hour 2) + (~r/pad minute 2) + (~r/pad second 2) + (if (negative? time-zone-offset) "-" "+") + (~r/pad offset-hours 2) + (~r/pad offset-minutes 2))])) + + (define (~r/pad num pad-to) (~r num #:min-width pad-to #:pad-string "0"))