From ccaae91601d174dbede2102458410597ec14a8ed Mon Sep 17 00:00:00 2001
From: Rodolphe Marques
Date: Fri, 12 May 2017 15:22:11 +0200
Subject: [PATCH] Added full text search support for mongodb.

- Created wildcard text index for the assets collection.
- Created backend query to do text search on the assets collection.
- Added and updated tests.
---
 bigchaindb/backend/mongodb/query.py   | 25 ++++++++
 bigchaindb/backend/mongodb/schema.py  |  5 +-
 bigchaindb/backend/query.py           |  8 +++
 tests/backend/mongodb/test_queries.py | 84 +++++++++++++++++++++++++++
 tests/backend/mongodb/test_schema.py  |  2 +-
 5 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/bigchaindb/backend/mongodb/query.py b/bigchaindb/backend/mongodb/query.py
index 39d99d4a..3d989a3d 100644
--- a/bigchaindb/backend/mongodb/query.py
+++ b/bigchaindb/backend/mongodb/query.py
@@ -327,3 +327,28 @@ def get_unvoted_blocks(conn, node_pubkey):
                 'votes': False, '_id': False
             }}
         ]))
+
+
+@register_query(MongoDBConnection)
+def text_search(conn, search, language='english', case_sensitive=False,
+                diacritic_sensitive=False, text_score=False, limit=0):
+    cursor = conn.run(
+        conn.collection('assets')
+        .find({'$text': {
+            '$search': search,
+            '$language': language,
+            '$caseSensitive': case_sensitive,
+            '$diacriticSensitive': diacritic_sensitive}},
+              {'score': {'$meta': 'textScore'}, '_id': False})
+        .sort([('score', {'$meta': 'textScore'})])
+        .limit(limit))
+
+    if text_score:
+        return cursor
+    else:
+        return (_remove_text_score(asset) for asset in cursor)
+
+
+def _remove_text_score(asset):
+    asset.pop('score', None)
+    return asset
diff --git a/bigchaindb/backend/mongodb/schema.py b/bigchaindb/backend/mongodb/schema.py
index 12b873e0..6c54bfd8 100644
--- a/bigchaindb/backend/mongodb/schema.py
+++ b/bigchaindb/backend/mongodb/schema.py
@@ -2,7 +2,7 @@
 
 import logging
 
-from pymongo import ASCENDING, DESCENDING
+from pymongo import ASCENDING, DESCENDING, TEXT
 
 from bigchaindb import backend
 from bigchaindb.common import exceptions
@@ -113,3 +113,6 @@ def create_assets_secondary_index(conn, dbname):
     conn.conn[dbname]['assets'].create_index('id',
                                              name='asset_id',
                                              unique=True)
+
+    # full text search index
+    conn.conn[dbname]['assets'].create_index([('$**', TEXT)], name='text')
diff --git a/bigchaindb/backend/query.py b/bigchaindb/backend/query.py
index 8245fb3d..705b0306 100644
--- a/bigchaindb/backend/query.py
+++ b/bigchaindb/backend/query.py
@@ -325,3 +325,11 @@ def get_txids_filtered(connection, asset_id, operation=None):
     """
 
     raise NotImplementedError
+
+
+@singledispatch
+def text_search(conn, search, language='english', case_sensitive=False,
+                diacritic_sensitive=False, text_score=False, limit=0):
+    # TODO: docstring
+
+    raise NotImplementedError
diff --git a/tests/backend/mongodb/test_queries.py b/tests/backend/mongodb/test_queries.py
index c43c5fa4..9b2ad1cf 100644
--- a/tests/backend/mongodb/test_queries.py
+++ b/tests/backend/mongodb/test_queries.py
@@ -464,3 +464,87 @@ def test_get_assets():
 
     assert cursor.count() == 2
     assert list(cursor.sort('id', pymongo.ASCENDING)) == assets[::2]
+
+
+def test_text_search():
+    from bigchaindb.backend import connect, query
+    conn = connect()
+
+    # Example data and test cases taken from the MongoDB documentation
+    # https://docs.mongodb.com/manual/reference/operator/query/text/
+    assets = [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+        {'id': 3, 'subject': 'Baking a cake', 'author': 'abc', 'views': 90},
+        {'id': 4, 'subject': 'baking', 'author': 'xyz', 'views': 100},
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+        {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
+        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
+    ]
+
+    # insert the assets
+    conn.db.assets.insert_many(deepcopy(assets), ordered=False)
+
+    # test search single word
+    assert list(query.text_search(conn, 'coffee')) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
+    ]
+
+    # match any of the search terms
+    assert list(query.text_search(conn, 'bake coffee cake')) == [
+        {'author': 'abc', 'id': 3, 'subject': 'Baking a cake', 'views': 90},
+        {'author': 'xyz', 'id': 1, 'subject': 'coffee', 'views': 50},
+        {'author': 'xyz', 'id': 4, 'subject': 'baking', 'views': 100},
+        {'author': 'efg', 'id': 2, 'subject': 'Coffee Shopping', 'views': 5},
+        {'author': 'efg', 'id': 7, 'subject': 'coffee and cream', 'views': 10}
+    ]
+
+    # search for a phrase
+    assert list(query.text_search(conn, '\"coffee shop\"')) == [
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+    ]
+
+    # exclude documents that contain a term
+    assert list(query.text_search(conn, 'coffee -shop')) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
+    ]
+
+    # search different language
+    assert list(query.text_search(conn, 'leche', language='es')) == [
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
+    ]
+
+    # case and diacritic insensitive search
+    assert list(query.text_search(conn, 'сы́рники CAFÉS')) == [
+        {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80},
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
+    ]
+
+    # case sensitive search
+    assert list(query.text_search(conn, 'Coffee', case_sensitive=True)) == [
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+    ]
+
+    # diacritic sensitive search
+    assert list(query.text_search(conn, 'CAFÉ', diacritic_sensitive=True)) == [
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+    ]
+
+    # return text score
+    assert list(query.text_search(conn, 'coffee', text_score=True)) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50, 'score': 1.0},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5, 'score': 0.75},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10, 'score': 0.75},
+    ]
+
+    # limit search result
+    assert list(query.text_search(conn, 'coffee', limit=2)) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+    ]
diff --git a/tests/backend/mongodb/test_schema.py b/tests/backend/mongodb/test_schema.py
index e3b320bd..e11dbfe8 100644
--- a/tests/backend/mongodb/test_schema.py
+++ b/tests/backend/mongodb/test_schema.py
@@ -33,7 +33,7 @@ def test_init_creates_db_tables_and_indexes():
     assert sorted(indexes) == ['_id_', 'block_and_voter']
 
     indexes = conn.conn[dbname]['assets'].index_information().keys()
-    assert sorted(indexes) == ['_id_', 'asset_id']
+    assert sorted(indexes) == ['_id_', 'asset_id', 'text']
 
 
 def test_init_database_fails_if_db_exists():
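
Note for reviewers: a minimal usage sketch of the new query, mirroring the calls in
test_text_search above. It assumes a node configured with the MongoDB backend, so
connect() picks up the connection settings from the local BigchainDB config; the names
and parameters are those added by this patch, the variable names are illustrative only.

    from bigchaindb.backend import connect, query

    conn = connect()

    # default: case- and diacritic-insensitive search across all indexed asset fields
    matches = list(query.text_search(conn, 'coffee'))

    # include MongoDB's relevance score and cap the result set at two documents
    scored = list(query.text_search(conn, 'coffee', text_score=True, limit=2))

The wildcard ('$**') text index created in schema.py makes every string field of the
assets collection searchable with a single index, at the cost of a larger index and
no per-field weighting.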