Usage Example#
For a comprehensive guide to the query language, see https://whoosh.readthedocs.io/en/latest/querylang.html
Import sayt#
[1]:
from sayt.api import (
DataSet,
IdField,
TextField,
NumericField,
NgramField,
NgramWordsField,
StoredField,
T_DOCUMENT,
T_DOWNLOADER,
)
[2]:
import typing as T
import os
import time
import uuid
import random
from pathlib import Path
import faker
from fixa.timer import DateTimeTimer
from diskcache import Cache
from rich import print as rprint
[3]:
dir_here = Path(os.getcwd())
fake = faker.Faker()
Define your dataset schema#
Let's say our dataset contains the details of books:
{
"id": "id-1234",
"title": "Sustainable Energy - without the hot air",
"author": "MacKay, David JC",
"year": 2009,
}
We want to match:

- id only when the query matches the id exactly.
- title when words in the query match words in the title, case-insensitively.
- author when any n-gram of characters in the query matches the author name.
- year via range queries.
[4]:
fields = [
# unique ID field
IdField(name="id", stored=True),
# match by token (word) or phrase
TextField(name="title", stored=True),
# match by n-gram characters
NgramField(
name="author",
stored=True,
minsize=2,
maxsize=6,
),
    # range query; sortable (ascending=False sorts in descending order)
    NumericField(
        name="year",
        stored=True,
        sortable=True,
        ascending=False,
    ),
    # stored only: returned in search results but not indexed or searchable
    StoredField(
        name="raw",
    ),
]
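The import list at the top also brings in NgramWordsField. Judging from the Whoosh field type the name mirrors (NGRAMWORDS vs NGRAM), it tokenizes text into words first and then indexes n-grams within each word, so n-grams never span word boundaries. A minimal sketch, assuming it accepts the same name / stored / minsize / maxsize parameters as NgramField:

from sayt.api import NgramWordsField

# a sketch, not from the original notebook: assumes NgramWordsField
# takes the same parameters as NgramField, indexing n-grams within
# each word instead of across the whole string
author_by_word = NgramWordsField(
    name="author",
    stored=True,
    minsize=2,
    maxsize=6,
)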
Define the downloader function#
A downloader is a zero-argument function that returns a list of searchable documents.
[5]:
def downloader():
return [
{
"id": "id-1234",
"title": "Sustainable Energy - without the hot air",
"author": "MacKay, David JC",
"year": 2009,
},
]
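In practice, a downloader usually pulls data from an external source such as a file, a database, or an HTTP API. A minimal sketch, assuming a hypothetical local books.json file that contains a JSON array of documents shaped like the schema above:

import json

def downloader_from_file():
    # books.json is a hypothetical file holding a JSON array of
    # {"id": ..., "title": ..., "author": ..., "year": ...} documents
    return json.loads(Path("books.json").read_text())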
Define the DataSet Object#
DataSet is an abstraction of a searchable dataset. It defines how to index and search your data, how to download it, and where to store the index and cache.
[6]:
ds = DataSet(
    dir_index=dir_here.joinpath(".index"),  # where the index is stored
    index_name="my-dataset",  # unique name of your dataset
    fields=fields,
    cache=Cache(str(dir_here.joinpath(".cache")), tag_index=True),  # where the cache is stored
    cache_key="my-dataset",  # unique cache key for your dataset
    cache_expire=10,  # how long until the cache expires (in seconds)
    cache_tag="my-dataset",  # a tag for batch deletion; to delete the cache for many datasets at once, give them the same tag
downloader=downloader,
)
ds.remove_all_index() # reset everything before testing
ds.remove_all_cache() # reset everything before testing
Play with the search method#
The DataSet.search method is the main API: it performs the search and handles caching, dataset refreshing, and all the other details.
[7]:
def run_query(query: str, limit: int = 5, simple_response: bool = True):
res = ds.search(query, limit=limit, simple_response=simple_response)
rprint(res)
Multi-Field Match#
By default, sayt tries to match the query against all searchable fields.
[8]:
run_query("id-1234")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[9]:
run_query("energy")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[10]:
run_query("dav")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[11]:
run_query("2009")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
Specify the Field you want to match#
You can use the ${field_name}:${query} syntax to search on a specific field.
[12]:
run_query("id:id-1234")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[13]:
run_query("title:energy")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[14]:
run_query("author:dav")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[15]:
run_query("year:2009")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
Range Query#
You can use the ${field_name}:${comparison_operator}${value} syntax to do a range query on a specific field.
[16]:
run_query("year:>2000")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[17]:
run_query("year:<2020")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[18]:
run_query("year:>2000 AND year:<2020")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[19]:
run_query("year:[2000 TO]")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[20]:
run_query("year:[TO 2020]")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[21]:
run_query("year:[2000 TO 2020]")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
Logical Operators#
You can use AND, OR, and NOT to connect multiple criteria. The default operator is AND.
[22]:
run_query("title:energy OR author:xyz")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[23]:
run_query("title:monster OR author:dav")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
[24]:
run_query("title:monster AND author:xyz")
[]
Fuzzy Search#
You can use the ${field_name}:${term}~${edit_distance} syntax to do a fuzzy search on a TextField.
[25]:
run_query("title:energi~1")
[ { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } ]
Elasticsearch-like Results#
You can set simple_response=False to return Elasticsearch-like results.
[26]:
rprint(ds.search("David", simple_response=False))
{ 'index': 'my-dataset', 'took': 0, 'size': 1, 'fresh': False, 'cache': False, 'hits': [ { '_id': 0, '_score': -2147485657, '_source': { 'author': 'MacKay, David JC', 'id': 'id-1234', 'title': 'Sustainable Energy - without the hot air', 'year': 2009 } } ] }
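Each hit carries an Elasticsearch-style _id, _score, and _source, with the stored document under _source. For example, to pull out just the stored fields:

res = ds.search("David", simple_response=False)
for hit in res["hits"]:
    doc = hit["_source"]  # the stored fields of the matched document
    rprint(doc["title"], doc["year"])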
Enable logging#
You can set verbose=True to show detailed logs.
[27]:
def downloader_5000_records():
# this time we want to work on a larger dataset
return [
{
"id": uuid.uuid4().hex,
"title": fake.sentence(),
"author": fake.name(),
"year": random.randint(1980, 2020),
}
for _ in range(5000) # 5,000
]
[28]:
ds.downloader = downloader_5000_records
rprint(ds.search("police", limit=3, simple_response=False, verbose=True))
+----- Start 'searching' ------------------------------------------------+
|
| dataset is NOT expired, skip the downloader
| NOT hit query cache!
| preprocessing query ...
| run search on index my-dataset...
| search took: 0 milliseconds
| return: 0 documents
| dataset is fresh: False
| hit cache: False
|
+----- End 'searching', elapsed = 0.03 sec ------------------------------+
{'index': 'my-dataset', 'took': 0, 'size': 0, 'fresh': False, 'cache': False, 'hits': []}
Query Caching#
Queries are cached automatically as long as the dataset has not expired. You can see that running the same query again takes almost no time.
[29]:
rprint(ds.search("police", limit=3, simple_response=False, verbose=True))
+----- Start 'searching' ------------------------------------------------+
|
| dataset is NOT expired, skip the downloader
| HIT query cache!
| search took: 0 milliseconds
| return: 0 documents
| dataset is fresh: False
| hit cache: True
|
+----- End 'searching', elapsed = 0.01 sec ------------------------------+
{'index': 'my-dataset', 'took': 0, 'size': 0, 'fresh': False, 'cache': True, 'hits': []}
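The DateTimeTimer imported earlier can make the speedup visible. A sketch, assuming DateTimeTimer is a context manager that prints the elapsed wall time of its block, and using a fresh query term so the first call misses the cache:

# a sketch: assumes DateTimeTimer prints elapsed wall time on exit
with DateTimeTimer("cold query"):
    ds.search("fire", limit=3)  # cache miss: runs the real search
with DateTimeTimer("warm query"):
    ds.search("fire", limit=3)  # cache hit: served from diskcache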
Automatically Refresh the Dataset#
You may want to automatically re-download the dataset every X seconds, hours, or days. Just set the expire time, and sayt will automatically re-run the downloader function whenever the dataset expires.
[30]:
ds.downloader = downloader_5000_records
ds.cache_expire = 1
ds.remove_all_index()
ds.remove_all_cache()
print("=== First run, it will download the data ===")
rprint(ds.search("police", limit=1, simple_response=False, verbose=True))
print("=== Second run, it will not download the data ===")
rprint(ds.search("police", limit=1, simple_response=False, verbose=True))
time.sleep(1)
print("=== Third run, it will automatically download the data ===")
rprint(ds.search("police", limit=1, simple_response=False, verbose=True))
=== First run, it will download the data ===
+----- Start 'searching' ------------------------------------------------+
|
| dataset is expired, need to rebuild the index
| +----- Start 'build index' --------------------------------------------+
| |
| | exam the index write lock ...
| | nice, it is not locked, working on indexing ...
| | finished indexing 5000 documents, commit the index.
| | the dataset will expire in 1 seconds.
| |
| +----- End 'build index', elapsed = 1.16 sec --------------------------+
| NOT hit query cache!
| preprocessing query ...
| run search on index my-dataset...
| search took: 4 milliseconds
| return: 1 documents
| dataset is fresh: True
| hit cache: False
|
+----- End 'searching', elapsed = 1.64 sec ------------------------------+
{ 'index': 'my-dataset', 'took': 4, 'size': 1, 'fresh': True, 'cache': False, 'hits': [ { '_id': 273, '_score': -2147485668, '_source': { 'author': 'Shirley Lucas', 'id': '2641a8c51b2341a88b97238efeeac352', 'title': 'Home police clearly provide.', 'year': 2020 } } ] }
=== Second run, it will not download the data ===
+----- Start 'searching' ------------------------------------------------+
|
| dataset is NOT expired, skip the downloader
| HIT query cache!
| search took: 4 milliseconds
| return: 1 documents
| dataset is fresh: False
| hit cache: True
|
+----- End 'searching', elapsed = 0.00 sec ------------------------------+
{ 'index': 'my-dataset', 'took': 4, 'size': 1, 'fresh': False, 'cache': True, 'hits': [ { '_id': 273, '_score': -2147485668, '_source': { 'author': 'Shirley Lucas', 'id': '2641a8c51b2341a88b97238efeeac352', 'title': 'Home police clearly provide.', 'year': 2020 } } ] }
=== Third run, it will automatically download the data ===
+----- Start 'searching' ------------------------------------------------+
|
| dataset is expired, need to rebuild the index
| +----- Start 'build index' --------------------------------------------+
| |
| | exam the index write lock ...
| | nice, it is not locked, working on indexing ...
| | finished indexing 5000 documents, commit the index.
| | the dataset will expire in 1 seconds.
| |
| +----- End 'build index', elapsed = 0.75 sec --------------------------+
| NOT hit query cache!
| preprocessing query ...
| run search on index my-dataset...
| search took: 4 milliseconds
| return: 1 documents
| dataset is fresh: True
| hit cache: False
|
+----- End 'searching', elapsed = 1.17 sec ------------------------------+
{ 'index': 'my-dataset', 'took': 4, 'size': 1, 'fresh': True, 'cache': False, 'hits': [ { '_id': 2046, '_score': -2147485668, '_source': { 'author': 'Nicholas Mckenzie', 'id': '257fb33d55b147d0ad8b3d5daa9d35af', 'title': 'Police real author dark realize.', 'year': 2020 } } ] }
Downloader Function with Parameters#
The downloader itself must take zero arguments, so if your data-fetching logic needs parameters, wrap it in a closure that captures them, as shown below.
[40]:
def _downloader(env: str):
return [
{"id": f"id-{i}-{env}", "title": f"my {i}th {env} machine"}
for i in range(1, 1+10)
]
def create_per_environment_dataset(env: str):
def downloader():
return _downloader(env=env)
return DataSet(
        dir_index=dir_here.joinpath(".index"),  # where the index is stored
index_name=f"my-{env}-dataset", # unique name of your dataset
fields=[
# unique ID field
IdField(name="id", stored=True),
# match by token (word) or phrase
TextField(name="title", stored=True),
],
        cache=Cache(str(dir_here.joinpath(".cache")), tag_index=True),  # where the cache is stored
        cache_key=f"my-{env}-dataset",  # unique cache key for your dataset
        cache_expire=10,  # how long until the cache expires (in seconds)
        cache_tag=f"my-{env}-dataset",  # a tag for batch deletion; to delete the cache for many datasets at once, give them the same tag
downloader=downloader,
)
[41]:
ds = create_per_environment_dataset(env="dev")
ds.remove_all_index() # reset everything before testing
ds.remove_all_cache() # reset everything before testing
rprint(ds.search("dev"))
[ {'id': 'id-1-dev', 'title': 'my 1th dev machine'}, {'id': 'id-2-dev', 'title': 'my 2th dev machine'}, {'id': 'id-3-dev', 'title': 'my 3th dev machine'}, {'id': 'id-4-dev', 'title': 'my 4th dev machine'}, {'id': 'id-5-dev', 'title': 'my 5th dev machine'}, {'id': 'id-6-dev', 'title': 'my 6th dev machine'}, {'id': 'id-7-dev', 'title': 'my 7th dev machine'}, {'id': 'id-8-dev', 'title': 'my 8th dev machine'}, {'id': 'id-9-dev', 'title': 'my 9th dev machine'}, {'id': 'id-10-dev', 'title': 'my 10th dev machine'} ]
[42]:
ds = create_per_environment_dataset(env="prod")
ds.remove_all_index() # reset everything before testing
ds.remove_all_cache() # reset everything before testing
rprint(ds.search("prod"))
[ {'id': 'id-1-prod', 'title': 'my 1th prod machine'}, {'id': 'id-2-prod', 'title': 'my 2th prod machine'}, {'id': 'id-3-prod', 'title': 'my 3th prod machine'}, {'id': 'id-4-prod', 'title': 'my 4th prod machine'}, {'id': 'id-5-prod', 'title': 'my 5th prod machine'}, {'id': 'id-6-prod', 'title': 'my 6th prod machine'}, {'id': 'id-7-prod', 'title': 'my 7th prod machine'}, {'id': 'id-8-prod', 'title': 'my 8th prod machine'}, {'id': 'id-9-prod', 'title': 'my 9th prod machine'}, {'id': 'id-10-prod', 'title': 'my 10th prod machine'} ]
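The cache_tag given to each dataset is what enables batch deletion. Because the diskcache Cache was created with tag_index=True, you can evict every cached query for one dataset while leaving the others intact; a minimal sketch using diskcache's evict:

cache = Cache(str(dir_here.joinpath(".cache")), tag_index=True)
# delete every cached entry tagged for the dev dataset; entries
# tagged "my-prod-dataset" are untouched
cache.evict("my-dev-dataset")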