work on full text search

This commit is contained in:
xenia 2021-06-12 04:37:32 -04:00
parent d51e965908
commit 1adb9c807a
5 changed files with 112 additions and 75 deletions

4
ext/Cargo.lock generated
View File

@ -98,6 +98,7 @@ dependencies = [
"libc",
"num-integer",
"num-traits",
"serde",
"time",
"winapi",
]
@ -255,6 +256,9 @@ dependencies = [
name = "fulltext"
version = "0.1.0"
dependencies = [
"chrono",
"serde",
"serde-lexpr",
"tantivy",
]

View File

@ -8,3 +8,6 @@ edition = "2018"
[dependencies]
tantivy = "0.15.0"
serde = { version = "1.0", features = ["derive"] }
serde-lexpr = "0.1.2"
chrono = { version = "0.4", features = ["serde"] }

View File

@ -1,86 +1,68 @@
use std::result::Result;
use chrono::prelude::*;
use serde::{Serialize, Deserialize};
#[macro_use]
extern crate tantivy;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::ReloadPolicy;
use tantivy::schema::{Schema, Facet, INDEXED, STORED, STRING, TEXT, FAST};
use tantivy::{Document, Index, IndexWriter, ReloadPolicy};
pub fn make_index() -> tantivy::Result<()> {
/// A single page handed to the full-text indexer.
///
/// Derives serde's `Serialize`/`Deserialize` plus `Debug`; all fields are private to this crate.
#[derive(Debug, Serialize, Deserialize)]
pub struct CapybaraPage {
    /// Path of the page, e.g. `/meow/meow2`.
    path: String,
    /// Page title.
    title: String,
    /// Timestamp attached to the page, in UTC.
    date: DateTime<Utc>,
    /// Short summary text.
    summary: String,
    /// Tag names, one entry per tag.
    tags: Vec<String>,
    /// Author names, one entry per author.
    authors: Vec<String>,
    /// Full body text of the page.
    body: String,
}
/// Builds a hard-coded sample [`CapybaraPage`] for trying out the indexer.
///
/// Every field is a fixed literal except `date`, which is set to the current UTC time
/// at the moment of the call.
pub fn example_page() -> CapybaraPage {
    let tags = vec![String::from("tag1"), String::from("WaterDrinkers")];
    let authors = vec![String::from("haskal")];

    CapybaraPage {
        path: String::from("/meow/meow2"),
        title: String::from("The Old Man and the Sea"),
        date: Utc::now(),
        summary: String::from("meow meow meow"),
        tags,
        authors,
        body: String::from("He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish."),
    }
}
/// Builds the tantivy [`Schema`] that [`add_document`] writes into.
///
/// Registered fields and the options they are created with:
/// - `xref_path`: text, `STRING | STORED`
/// - `xref`: facet, `INDEXED`
/// - `title`: text, `TEXT | STORED`
/// - `date`: date, `FAST`
/// - `summary`: text, `TEXT | STORED`
/// - `tags`: text, `TEXT | STORED`
/// - `authors`: text, `TEXT | STORED`
/// - `body`: text, `TEXT` (not stored)
pub fn make_schema() -> Schema {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("xref_path", STRING | STORED);
    schema_builder.add_facet_field("xref", INDEXED);
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_date_field("date", FAST);
    schema_builder.add_text_field("summary", TEXT | STORED);
    schema_builder.add_text_field("tags", TEXT | STORED);
    schema_builder.add_text_field("authors", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);
    // `build` consumes the builder, so it must be called exactly once, as the return value.
    schema_builder.build()
}
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer(50_000_000)?;
/// Converts `doc` into a tantivy [`Document`] and queues it on `index_writer`.
///
/// The page path is written twice: as text into `xref_path` and as a facet into `xref`.
/// `tags` and `authors` are each joined with a single space and written as one text value.
/// The document is only added to the writer; committing is left to the caller.
///
/// # Panics
///
/// Panics if `schema` lacks any of the fields `xref_path`, `xref`, `title`, `date`,
/// `summary`, `tags`, `authors` or `body` (i.e. it was not produced by [`make_schema`]).
pub fn add_document(schema: &Schema, index_writer: &mut IndexWriter, doc: &CapybaraPage) {
    // Resolve a field by name, naming the missing field in the panic message.
    let field = |name: &str| {
        schema
            .get_field(name)
            .unwrap_or_else(|| panic!("schema is missing required field `{}`", name))
    };

    let xref_path = field("xref_path");
    let xref = field("xref");
    let title = field("title");
    let date = field("date");
    let summary = field("summary");
    let tags = field("tags");
    let authors = field("authors");
    let body = field("body");

    let mut out_doc = Document::default();
    out_doc.add_text(xref_path, &doc.path);
    out_doc.add_facet(xref, Facet::from(&doc.path));
    out_doc.add_text(title, &doc.title);
    out_doc.add_date(date, &doc.date);
    out_doc.add_text(summary, &doc.summary);
    out_doc.add_text(tags, doc.tags.join(" "));
    out_doc.add_text(authors, doc.authors.join(" "));
    out_doc.add_text(body, &doc.body);

    index_writer.add_document(out_doc);
}

25
ext/fulltext/src/main.rs Normal file
View File

@ -0,0 +1,25 @@
pub fn main() {
let schema = fulltext::make_schema();
let page = fulltext::example_page();
let index = tantivy::Index::create_in_dir("/tmp/tantivy-test", schema.clone()).unwrap();
let mut index_writer = index.writer(50_000_000).unwrap();
fulltext::add_document(&schema, &mut index_writer, &page);
index_writer.commit().unwrap();
let reader = index.reader_builder().reload_policy(tantivy::ReloadPolicy::OnCommit)
.try_into().unwrap();
let searcher = reader.searcher();
let query_parser = tantivy::query::QueryParser::for_index(&index,
vec![schema.get_field("title").unwrap(), schema.get_field("body").unwrap()]);
let query = query_parser.parse_query("man fish").unwrap();
let top_docs = searcher.search(&query, &tantivy::collector::TopDocs::with_limit(10)).unwrap();
println!("top docs");
for (_score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address).unwrap();
println!("got doc: {}", schema.to_json(&retrieved_doc));
}
println!("end of docs");
}

View File

@ -1,13 +1,36 @@
#lang racket/base
(require racket/date racket/format racket/string racket/port)
(require racket/date racket/format racket/match racket/string racket/port)
(provide ~r/pad run-external get-date-ymd)
(provide ~r/pad run-external get-date-ymd date->string/iso-8601)
;; Returns the current date as a three-element list: (year month day).
(define (get-date-ymd)
  (let ([now (current-date)])
    (map (lambda (field) (field now))
         (list date-year date-month date-day))))
;; Renders the date struct `d` as an ISO-8601 timestamp with an explicit UTC offset,
;; e.g. "2021-06-12T04:37:32-04:00".
;;
;; racket's own (date-display-format 'iso-8601) leaves the time zone offset out entirely.
;; Appending the offset to the output of date->string would break if a future racket version
;; started emitting it, so the whole string is assembled here from scratch instead.
(define (date->string/iso-8601 d)
  (match d
    [(date second minute hour day month year _ _ _ time-zone-offset)
     ;; the sign is printed on its own, so the hour/minute split works on the magnitude
     (define sign (if (negative? time-zone-offset) "-" "+"))
     (define total-minutes (abs (quotient time-zone-offset 60)))
     (define offset-hours (quotient total-minutes 60))
     (define offset-minutes (remainder total-minutes 60))
     (define (pad2 n) (~r/pad n 2))
     (format "~a-~a-~aT~a:~a:~a~a~a:~a"
             (~r/pad year 4)
             (pad2 month)
             (pad2 day)
             (pad2 hour)
             (pad2 minute)
             (pad2 second)
             sign
             (pad2 offset-hours)
             (pad2 offset-minutes))]))
;; Formats `num` with ~r, padding with "0" until the result is at least `pad-to` characters wide.
(define (~r/pad num pad-to)
  (~r #:pad-string "0"
      #:min-width pad-to
      num))