Usage Example#

For a comprehensive guide to the query language, see https://whoosh.readthedocs.io/en/latest/querylang.html

Import sayt#

[1]:
from sayt.api import (
    DataSet,
    IdField,
    TextField,
    NumericField,
    NgramField,
    NgramWordsField,
    StoredField,
    T_DOCUMENT,
    T_DOWNLOADER,
)
[2]:
import typing as T
import os
import time
import uuid
import random
from pathlib import Path

import faker
from fixa.timer import DateTimeTimer
from diskcache import Cache
from rich import print as rprint
[3]:
dir_here = Path(os.getcwd())

fake = faker.Faker()

Define your dataset schema#

Let’s say each document in our dataset describes a book:

{
    "id": "id-1234",
    "title": "Sustainable Energy - without the hot air",
    "author": "MacKay, David JC",
    "year": 2009,
}
  • We want to match id only if the query match the id exactly.

  • We want to match title when words in the query match the word in the title, case-insensitive.

  • We want to match author when any ngram characters in the query match the author name.

  • We want to use range query to filter on year.

[4]:
fields = [
    # unique ID field
    IdField(name="id", stored=True),
    # match by token (word) or phrase
    TextField(name="title", stored=True),
    # match by n-gram characters
    NgramField(
        name="author",
        stored=True,
        minsize=2,
        maxsize=6,
    ),
    # range query
    NumericField(
        name="year",
        stored=True,
        sortable=True,
        ascending=False,
    ),
    # stored only (returned in results), not searchable
    StoredField(
        name="raw",
    ),
]

Define the downloader function#

A downloader is a function with zero arguments that returns a list of searchable documents.

[5]:
def downloader():
    return [
        {
            "id": "id-1234",
            "title": "Sustainable Energy - without the hot air",
            "author": "MacKay, David JC",
            "year": 2009,
        },
    ]
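
sayt.api also exports the T_DOCUMENT and T_DOWNLOADER type hints imported above. Here is a minimal sketch of an annotated downloader, assuming T_DOCUMENT is the document dict type and T_DOWNLOADER is the zero-argument downloader callable (check sayt.api for the exact definitions):

def typed_downloader() -> T.List[T_DOCUMENT]:
    # same single-book document as above, just with explicit type hints
    return [
        {
            "id": "id-1234",
            "title": "Sustainable Energy - without the hot air",
            "author": "MacKay, David JC",
            "year": 2009,
        },
    ]

typed_downloader_fn: T_DOWNLOADER = typed_downloader  # assumed alias for the downloader callable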

Define the Dataset Object#

A DataSet is an abstraction of a searchable dataset. It defines how to index and search your dataset, how to download it, and where to store the index and the cache.

[6]:
ds = DataSet(
    dir_index=dir_here.joinpath(".index"), # where the index is stored
    index_name="my-dataset", # unique name of your dataset
    fields=fields,
    cache=Cache(str(dir_here.joinpath(".cache")), tag_index=True), # where the cache is stored
    cache_key="my-dataset", # unique cache key for your dataset
    cache_expire=10, # how long until the cache expires (in seconds)
    cache_tag="my-dataset", # a tag for batch deletion; give multiple datasets the same tag to delete their caches together
    downloader=downloader,
)
ds.remove_all_index() # reset everything before testing
ds.remove_all_cache() # reset everything before testing
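
Note that cache_tag enables batch deletion: the Cache above was created with tag_index=True, so every entry cached for this dataset can carry the "my-dataset" tag, and diskcache can evict all of them at once. A minimal sketch using diskcache's own evict API (not a sayt method), assuming sayt tags its cached entries with cache_tag:

cache = Cache(str(dir_here.joinpath(".cache")), tag_index=True)
cache.evict("my-dataset")  # remove every cached entry tagged "my-dataset"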

Play with the search method#

The DataSet.search method is the main API: it performs the search and handles caching, dataset refreshing, and all the other details.

[7]:
def run_query(query: str, limit: int = 5, simple_response: bool = True):
    res = ds.search(query, limit=limit, simple_response=simple_response)
    rprint(res)

Multi Field Match#

By default, sayt tries to match the query against all searchable fields.

[8]:
run_query("id-1234")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[9]:
run_query("energy")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[10]:
run_query("dav")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[11]:
run_query("2009")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]

Specify the Field you want to match#

You can use the ${field_name}:${query} syntax to search on a specific field.

[12]:
run_query("id:id-1234")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[13]:
run_query("title:energy")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[14]:
run_query("author:dav")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[15]:
run_query("year:2009")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
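
The title field is a TextField, which also supports phrase queries in the Whoosh syntax linked at the top of this page (wrap the phrase in double quotes). A minimal sketch, output omitted:

run_query('title:"hot air"')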

Range Query#

You can use the ${field_name}:${comparison_operator}${value} syntax to run a range query on a specific field.

[16]:
run_query("year:>2000")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[17]:
run_query("year:<2020")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[18]:
run_query("year:>2000 AND year:<2020")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[19]:
run_query("year:[2000 TO]")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[20]:
run_query("year:[TO 2020]")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[21]:
run_query("year:[2000 TO 2020]")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]

Logical Operator#

You can use the AND, OR, and NOT operators to combine multiple criteria. The default operator is AND, so a query like title:energy author:dav is treated as title:energy AND author:dav.

[22]:
run_query("title:energy OR author:xyz")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[23]:
run_query("title:monster OR author:dav")
[
    {
        'author': 'MacKay, David JC',
        'id': 'id-1234',
        'title': 'Sustainable Energy - without the hot air',
        'year': 2009
    }
]
[24]:
run_query("title:monster AND author:xyz")
[]

Elasticsearch-like results#

You can set simple_response=False to return an Elasticsearch-like response. Besides the hits (each with _id, _score, and _source), it includes metadata such as took (search time in milliseconds), size (number of hits), fresh (whether the index was just rebuilt), and cache (whether the result came from the query cache).

[26]:
rprint(ds.search("David", simple_response=False))
{
    'index': 'my-dataset',
    'took': 0,
    'size': 1,
    'fresh': False,
    'cache': False,
    'hits': [
        {
            '_id': 0,
            '_score': -2147485657,
            '_source': {
                'author': 'MacKay, David JC',
                'id': 'id-1234',
                'title': 'Sustainable Energy - without the hot air',
                'year': 2009
            }
        }
    ]
}

Enable logging#

You can set verbose=True to show a detailed log.

[27]:
def downloader_5000_records():
    # this time we want to work on a larger dataset
    return [
        {
            "id": uuid.uuid4().hex,
            "title": fake.sentence(),
            "author": fake.name(),
            "year": random.randint(1980, 2020),
        }
        for _ in range(5000) # 5,000
    ]
[28]:
ds.downloader = downloader_5000_records
rprint(ds.search("police", limit=3, simple_response=False, verbose=True))
+----- ⏱ 🟒 πŸ”Ž Start 'searching' ------------------------------------------------+
πŸ”Ž
πŸ”Ž dataset is NOT expired, skip the downloader
πŸ”Ž NOT hit query cache!
πŸ”Ž preprocessing query ...
πŸ”Ž run search on index my-dataset...
πŸ”Ž   search took: 0 milliseconds
πŸ”Ž   return: 0 documents
πŸ”Ž   dataset is fresh: False
πŸ”Ž   hit cache: False
πŸ”Ž
+----- ⏰ πŸ”΄ πŸ”Ž End 'searching', elapsed = 0.03 sec ------------------------------+
{'index': 'my-dataset', 'took': 0, 'size': 0, 'fresh': False, 'cache': False, 'hits': []}

Query Caching#

Query results are cached automatically as long as the dataset has not expired. You can see that running the same query again takes almost no time (about 0.01 sec below).

[29]:
rprint(ds.search("police", limit=3, simple_response=False, verbose=True))
+----- ⏱ 🟒 πŸ”Ž Start 'searching' ------------------------------------------------+
πŸ”Ž
πŸ”Ž dataset is NOT expired, skip the downloader
πŸ”Ž HIT query cache!
πŸ”Ž   search took: 0 milliseconds
πŸ”Ž   return: 0 documents
πŸ”Ž   dataset is fresh: False
πŸ”Ž   hit cache: True
πŸ”Ž
+----- ⏰ πŸ”΄ πŸ”Ž End 'searching', elapsed = 0.01 sec ------------------------------+
{'index': 'my-dataset', 'took': 0, 'size': 0, 'fresh': False, 'cache': True, 'hits': []}
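
If you ever need to invalidate the cache manually, for example after changing the downloader, you can call the same reset helpers shown earlier:

ds.remove_all_cache()  # drop all cached query results
ds.remove_all_index()  # drop the index, so the next search rebuilds it from the downloader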

Automatically Refresh the Dataset#

You may want to re-download the dataset automatically every X seconds / hours / days. Just set the expire time, and the downloader function is re-run automatically whenever the dataset has expired.

[30]:
ds.downloader = downloader_5000_records
ds.cache_expire = 1
ds.remove_all_index()
ds.remove_all_cache()

print("=== First run, it will download the data ===")
rprint(ds.search("police", limit=1, simple_response=False, verbose=True))

print("=== Second run, it will not download the data ===")
rprint(ds.search("police", limit=1, simple_response=False, verbose=True))

time.sleep(1)
print("=== Third run, it will automatically download the data ===")
rprint(ds.search("police", limit=1, simple_response=False, verbose=True))
=== First run, it will download the data ===
+----- ⏱ 🟒 πŸ”Ž Start 'searching' ------------------------------------------------+
πŸ”Ž
πŸ”Ž dataset is expired, need to rebuild the index
πŸ”Ž +----- ⏱ 🟒 πŸ— Start 'build index' --------------------------------------------+
πŸ”Ž πŸ—
πŸ”Ž πŸ— exam the index write lock ...
πŸ”Ž πŸ—   nice, it is not locked, working on indexing ...
πŸ”Ž πŸ—     finished indexing 5000 documents, commit the index.
πŸ”Ž πŸ—     the dataset will expire in 1 seconds.
πŸ”Ž πŸ—
πŸ”Ž +----- ⏰ πŸ”΄ πŸ— End 'build index', elapsed = 1.16 sec --------------------------+
πŸ”Ž NOT hit query cache!
πŸ”Ž preprocessing query ...
πŸ”Ž run search on index my-dataset...
πŸ”Ž   search took: 4 milliseconds
πŸ”Ž   return: 1 documents
πŸ”Ž   dataset is fresh: True
πŸ”Ž   hit cache: False
πŸ”Ž
+----- ⏰ πŸ”΄ πŸ”Ž End 'searching', elapsed = 1.64 sec ------------------------------+
{
    'index': 'my-dataset',
    'took': 4,
    'size': 1,
    'fresh': True,
    'cache': False,
    'hits': [
        {
            '_id': 273,
            '_score': -2147485668,
            '_source': {
                'author': 'Shirley Lucas',
                'id': '2641a8c51b2341a88b97238efeeac352',
                'title': 'Home police clearly provide.',
                'year': 2020
            }
        }
    ]
}
=== Second run, it will not download the data ===
+----- ⏱ 🟒 πŸ”Ž Start 'searching' ------------------------------------------------+
πŸ”Ž
πŸ”Ž dataset is NOT expired, skip the downloader
πŸ”Ž HIT query cache!
πŸ”Ž   search took: 4 milliseconds
πŸ”Ž   return: 1 documents
πŸ”Ž   dataset is fresh: False
πŸ”Ž   hit cache: True
πŸ”Ž
+----- ⏰ πŸ”΄ πŸ”Ž End 'searching', elapsed = 0.00 sec ------------------------------+
{
    'index': 'my-dataset',
    'took': 4,
    'size': 1,
    'fresh': False,
    'cache': True,
    'hits': [
        {
            '_id': 273,
            '_score': -2147485668,
            '_source': {
                'author': 'Shirley Lucas',
                'id': '2641a8c51b2341a88b97238efeeac352',
                'title': 'Home police clearly provide.',
                'year': 2020
            }
        }
    ]
}
=== Third run, it will automatically download the data ===
+----- ⏱ 🟒 πŸ”Ž Start 'searching' ------------------------------------------------+
πŸ”Ž
πŸ”Ž dataset is expired, need to rebuild the index
πŸ”Ž +----- ⏱ 🟒 πŸ— Start 'build index' --------------------------------------------+
πŸ”Ž πŸ—
πŸ”Ž πŸ— exam the index write lock ...
πŸ”Ž πŸ—   nice, it is not locked, working on indexing ...
πŸ”Ž πŸ—     finished indexing 5000 documents, commit the index.
πŸ”Ž πŸ—     the dataset will expire in 1 seconds.
πŸ”Ž πŸ—
πŸ”Ž +----- ⏰ πŸ”΄ πŸ— End 'build index', elapsed = 0.75 sec --------------------------+
πŸ”Ž NOT hit query cache!
πŸ”Ž preprocessing query ...
πŸ”Ž run search on index my-dataset...
πŸ”Ž   search took: 4 milliseconds
πŸ”Ž   return: 1 documents
πŸ”Ž   dataset is fresh: True
πŸ”Ž   hit cache: False
πŸ”Ž
+----- ⏰ πŸ”΄ πŸ”Ž End 'searching', elapsed = 1.17 sec ------------------------------+
{
    'index': 'my-dataset',
    'took': 4,
    'size': 1,
    'fresh': True,
    'cache': False,
    'hits': [
        {
            '_id': 2046,
            '_score': -2147485668,
            '_source': {
                'author': 'Nicholas Mckenzie',
                'id': '257fb33d55b147d0ad8b3d5daa9d35af',
                'title': 'Police real author dark realize.',
                'year': 2020
            }
        }
    ]
}
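
cache_expire is a plain attribute, so you can tune the refresh interval to whatever your data needs; for example, a sketch of a roughly hourly refresh:

ds.cache_expire = 3600  # re-download the dataset if it is older than one hour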

Downloader function with parameters#

The downloader passed to DataSet must take zero arguments, so if your download logic needs parameters (for example, an environment name), bind them with a closure as shown below.

[40]:
def _downloader(env: str):
    return [
        {"id": f"id-{i}-{env}", "title": f"my {i}th {env} machine"}
        for i in range(1, 1+10)
    ]


def create_per_environment_dataset(env: str):
    def downloader():
        return _downloader(env=env)
    return DataSet(
        dir_index=dir_here.joinpath(".index"), # where the index is stored
        index_name=f"my-{env}-dataset", # unique name of your dataset
        fields=[
            # unique ID field
            IdField(name="id", stored=True),
            # match by token (word) or phrase
            TextField(name="title", stored=True),
        ],
        cache=Cache(str(dir_here.joinpath(".cache")), tag_index=True), # where the cache is stored
        cache_key=f"my-{env}-dataset", # unique cache key for your dataset
        cache_expire=10, # how long until the cache expires (in seconds)
        cache_tag=f"my-{env}-dataset", # a tag for batch deletion; give multiple datasets the same tag to delete their caches together
        downloader=downloader,
    )
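
Because the downloader itself must take zero arguments, create_per_environment_dataset binds env with a closure. An equivalent approach is functools.partial; a minimal sketch reusing the _downloader above:

import functools

downloader = functools.partial(_downloader, env="dev")  # a zero-argument callable, same as the closure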
[41]:
ds = create_per_environment_dataset(env="dev")
ds.remove_all_index() # reset everything before testing
ds.remove_all_cache() # reset everything before testing
rprint(ds.search("dev"))
[
    {'id': 'id-1-dev', 'title': 'my 1th dev machine'},
    {'id': 'id-2-dev', 'title': 'my 2th dev machine'},
    {'id': 'id-3-dev', 'title': 'my 3th dev machine'},
    {'id': 'id-4-dev', 'title': 'my 4th dev machine'},
    {'id': 'id-5-dev', 'title': 'my 5th dev machine'},
    {'id': 'id-6-dev', 'title': 'my 6th dev machine'},
    {'id': 'id-7-dev', 'title': 'my 7th dev machine'},
    {'id': 'id-8-dev', 'title': 'my 8th dev machine'},
    {'id': 'id-9-dev', 'title': 'my 9th dev machine'},
    {'id': 'id-10-dev', 'title': 'my 10th dev machine'}
]
[42]:
ds = create_per_environment_dataset(env="prod")
ds.remove_all_index() # reset everything before testing
ds.remove_all_cache() # reset everything before testing
rprint(ds.search("prod"))
[
    {'id': 'id-1-prod', 'title': 'my 1th prod machine'},
    {'id': 'id-2-prod', 'title': 'my 2th prod machine'},
    {'id': 'id-3-prod', 'title': 'my 3th prod machine'},
    {'id': 'id-4-prod', 'title': 'my 4th prod machine'},
    {'id': 'id-5-prod', 'title': 'my 5th prod machine'},
    {'id': 'id-6-prod', 'title': 'my 6th prod machine'},
    {'id': 'id-7-prod', 'title': 'my 7th prod machine'},
    {'id': 'id-8-prod', 'title': 'my 8th prod machine'},
    {'id': 'id-9-prod', 'title': 'my 9th prod machine'},
    {'id': 'id-10-prod', 'title': 'my 10th prod machine'}
]