Merge pull request #1469 from bigchaindb/feat/1462/text-search

Text search support for mongodb backend
2024-06-28 08:37:45 +02:00 · 2017-05-29 13:30:45 +02:00 · 2017-05-29 13:30:45 +02:00 · ac2d65d23d
commit ac2d65d23d
parent 45a10a2577 ef52c04808
7 changed files with 239 additions and 2 deletions
--- a/bigchaindb/backend/mongodb/query.py
+++ b/bigchaindb/backend/mongodb/query.py
@ -353,3 +353,28 @@ def get_unvoted_blocks(conn, node_pubkey):
                'votes': False, '_id': False
            }}
        ]))
+
+
+@register_query(MongoDBConnection)
+def text_search(conn, search, *, language='english', case_sensitive=False,
+                diacritic_sensitive=False, text_score=False, limit=0):
+    """Run a ``$text`` query against the ``assets`` collection.
+
+    The matching documents are projected without ``_id`` and with the
+    text score stored under ``score``, then sorted by that text score.
+
+    Args:
+        conn: connection used to look up the ``assets`` collection and to
+            run the query.
+        search (str): value passed as ``$search``.
+        language (str, optional): value passed as ``$language``.
+        case_sensitive (bool, optional): value passed as
+            ``$caseSensitive``.
+        diacritic_sensitive (bool, optional): value passed as
+            ``$diacriticSensitive``.
+        text_score (bool, optional): if true, the ``score`` field is kept
+            in the returned documents.
+        limit (int, optional): value passed to the cursor's ``limit``.
+            NOTE(review): the default ``0`` presumably means "no limit"
+            in pymongo -- confirm.
+
+    Returns:
+        Whatever ``conn.run`` returned (the cursor) when ``text_score`` is
+        true, otherwise a generator that yields each document with its
+        ``score`` field removed.
+    """
+    cursor = conn.run(
+        conn.collection('assets')
+        .find({'$text': {
+                '$search': search,
+                '$language': language,
+                '$caseSensitive': case_sensitive,
+                '$diacriticSensitive': diacritic_sensitive}},
+              {'score': {'$meta': 'textScore'}, '_id': False})
+        .sort([('score', {'$meta': 'textScore'})])
+        .limit(limit))
+
+    # the score is always projected because the sort uses it; it is only
+    # handed back to the caller on request
+    if text_score:
+        return cursor
+
+    return (_remove_text_score(asset) for asset in cursor)
+
+
+def _remove_text_score(asset):
+    """Drop the ``score`` key from ``asset`` in place and return ``asset``.
+
+    A missing ``score`` key is ignored.
+    """
+    asset.pop('score', None)
+    return asset
--- a/bigchaindb/backend/mongodb/schema.py
+++ b/bigchaindb/backend/mongodb/schema.py
@ -2,7 +2,7 @@

 import logging

-from pymongo import ASCENDING, DESCENDING
+from pymongo import ASCENDING, DESCENDING, TEXT

 from bigchaindb import backend
 from bigchaindb.common import exceptions
@ -113,3 +113,6 @@ def create_assets_secondary_index(conn, dbname):
    conn.conn[dbname]['assets'].create_index('id',
                                             name='asset_id',
                                             unique=True)
+
+    # full text search index
+    conn.conn[dbname]['assets'].create_index([('$**', TEXT)], name='text')
--- a/bigchaindb/backend/query.py
+++ b/bigchaindb/backend/query.py
@ -2,6 +2,8 @@

 from functools import singledispatch

+from bigchaindb.backend.exceptions import OperationError
+

@singledispatch
 def write_transaction(connection, signed_transaction):
@ -353,3 +355,33 @@ def get_txids_filtered(connection, asset_id, operation=None):
    """

    raise NotImplementedError
+
+
+@singledispatch
+def text_search(conn, search, *, language='english', case_sensitive=False,
+                diacritic_sensitive=False, text_score=False, limit=0):
+    """Return all the assets that match the text search.
+
+    The results are sorted by text score.
+    For more information about the behavior of text search on MongoDB see
+    https://docs.mongodb.com/manual/reference/operator/query/text/#behavior
+
+    This generic implementation is the ``singledispatch`` fallback and
+    always raises.
+
+    Args:
+        conn: the backend connection the query is dispatched on.
+        search (str): Text search string to query the text index
+        language (str, optional): The language for the search and the rules for
+            stemmer and tokenizer. If the language is ``None`` text search uses
+            simple tokenization and no stemming.
+            NOTE(review): MongoDB documents the string ``'none'`` for this
+            behavior; confirm that ``None`` is accepted by the backend.
+        case_sensitive (bool, optional): Enable or disable case sensitive
+            search.
+        diacritic_sensitive (bool, optional): Enable or disable diacritic
+            sensitive search.
+        text_score (bool, optional): If ``True`` returns the text score with
+            each document.
+        limit (int, optional): Limit the number of returned documents.
+
+    Returns:
+        An iterable of :obj:`dict`: the matching assets. A backend may
+        return a lazy cursor or generator rather than a :obj:`list`.
+
+    Raises:
+        OperationError: always, in this generic implementation.
+    """
+
+    raise OperationError('This query is only supported when running '
+                         'BigchainDB with MongoDB as the backend.')
--- a/bigchaindb/core.py
+++ b/bigchaindb/core.py
@ -619,3 +619,14 @@ class Bigchain(object):
                the database.
        """
        return backend.query.write_assets(self.connection, assets)
+
+    def text_search(self, search, *, limit=0):
+        """Yield the assets that match ``search`` and belong to a valid
+        transaction.
+
+        This is a generator: the backend query is only issued once
+        iteration starts, so any error it raises surfaces on the first
+        ``next()`` rather than when this method is called.
+
+        Args:
+            search (str): text search string passed to the backend query.
+            limit (int, optional): passed to the backend query as
+                ``limit``. It is applied before the validity filter below,
+                so fewer than ``limit`` assets may be yielded.
+
+        Yields:
+            Each asset returned by the backend whose transaction status is
+            ``TX_VALID``.
+        """
+        assets = backend.query.text_search(self.connection, search, limit=limit)
+
+        # TODO: This is not efficient. There may be a more efficient way to
+        #       query by storing block ids with the assets and using fastquery.
+        #       See https://github.com/bigchaindb/bigchaindb/issues/1496
+        for asset in assets:
+            # only the status is needed; the transaction itself is discarded
+            _, status = self.get_transaction(asset['id'], True)
+            if status == self.TX_VALID:
+                yield asset
--- a/tests/backend/mongodb/test_queries.py
+++ b/tests/backend/mongodb/test_queries.py
@ -513,3 +513,87 @@ def test_get_assets():

    assert cursor.count() == 2
    assert list(cursor.sort('id', pymongo.ASCENDING)) == assets[::2]
+
+
+def test_text_search():
+    """Exercise ``query.text_search`` against a fixed set of assets.
+
+    Covers single and multiple terms, phrases, term exclusion, a
+    non-default language, case and diacritic sensitivity, returning the
+    text score, and limiting the number of results.
+    """
+    from bigchaindb.backend import connect, query
+    conn = connect()
+
+    # Example data and tests cases taken from the mongodb documentation
+    # https://docs.mongodb.com/manual/reference/operator/query/text/
+    assets = [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+        {'id': 3, 'subject': 'Baking a cake', 'author': 'abc', 'views': 90},
+        {'id': 4, 'subject': 'baking', 'author': 'xyz', 'views': 100},
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+        {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
+        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
+    ]
+
+    # insert the assets
+    # NOTE(review): a copy is inserted, presumably because pymongo adds an
+    # ``_id`` key to the dicts it is given; ``deepcopy`` is expected to be
+    # imported at module level (not visible here) -- confirm
+    conn.db.assets.insert_many(deepcopy(assets), ordered=False)
+
+    # test search single word
+    # NOTE(review): ids 2 and 7 share the same score (see the text score
+    # assertion below), so their relative order is presumably insertion
+    # order -- confirm it is stable
+    assert list(query.text_search(conn, 'coffee')) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
+    ]
+
+    # match any of the search terms
+    assert list(query.text_search(conn, 'bake coffee cake')) == [
+        {'author': 'abc', 'id': 3, 'subject': 'Baking a cake', 'views': 90},
+        {'author': 'xyz', 'id': 1, 'subject': 'coffee', 'views': 50},
+        {'author': 'xyz', 'id': 4, 'subject': 'baking', 'views': 100},
+        {'author': 'efg', 'id': 2, 'subject': 'Coffee Shopping', 'views': 5},
+        {'author': 'efg', 'id': 7, 'subject': 'coffee and cream', 'views': 10}
+    ]
+
+    # search for a phrase
+    assert list(query.text_search(conn, '\"coffee shop\"')) == [
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+    ]
+
+    # exclude documents that contain a term
+    assert list(query.text_search(conn, 'coffee -shop')) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
+    ]
+
+    # search different language
+    assert list(query.text_search(conn, 'leche', language='es')) == [
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
+    ]
+
+    # case and diacritic insensitive search
+    assert list(query.text_search(conn, 'сы́рники CAFÉS')) == [
+        {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80},
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
+    ]
+
+    # case sensitive search
+    assert list(query.text_search(conn, 'Coffee', case_sensitive=True)) == [
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+    ]
+
+    # diacritic sensitive search
+    assert list(query.text_search(conn, 'CAFÉ', diacritic_sensitive=True)) == [
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+    ]
+
+    # return text score
+    assert list(query.text_search(conn, 'coffee', text_score=True)) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50, 'score': 1.0},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5, 'score': 0.75},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10, 'score': 0.75},
+    ]
+
+    # limit search result
+    assert list(query.text_search(conn, 'coffee', limit=2)) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+    ]
--- a/tests/backend/mongodb/test_schema.py
+++ b/tests/backend/mongodb/test_schema.py
@ -33,7 +33,7 @@ def test_init_creates_db_tables_and_indexes():
    assert sorted(indexes) == ['_id_', 'block_and_voter']

    indexes = conn.conn[dbname]['assets'].index_information().keys()
-    assert sorted(indexes) == ['_id_', 'asset_id']
+    assert sorted(indexes) == ['_id_', 'asset_id', 'text']


 def test_init_database_fails_if_db_exists():
--- a/tests/db/test_bigchain_api.py
+++ b/tests/db/test_bigchain_api.py
@ -213,6 +213,88 @@ class TestBigchainApi(object):
        assert b.get_transaction(tx1.id) is None
        assert b.get_transaction(tx2.id) == tx2

+    @pytest.mark.genesis
+    def test_text_search(self, b):
+        """Check that ``text_search`` finds the assets of a valid block.
+
+        Three transactions with matching assets are written in one block
+        that is voted valid; all three assets are expected back. If the
+        query raises ``OperationError`` the test only checks that the
+        connection is not a MongoDB one.
+        """
+        from bigchaindb.models import Transaction
+        from bigchaindb.backend.exceptions import OperationError
+        from bigchaindb.backend.mongodb.connection import MongoDBConnection
+
+        # define the assets
+        asset1 = {'msg': 'BigchainDB 1'}
+        asset2 = {'msg': 'BigchainDB 2'}
+        asset3 = {'msg': 'BigchainDB 3'}
+
+        # create the transactions
+        tx1 = Transaction.create([b.me], [([b.me], 1)],
+                                 asset=asset1).sign([b.me_private])
+        tx2 = Transaction.create([b.me], [([b.me], 1)],
+                                 asset=asset2).sign([b.me_private])
+        tx3 = Transaction.create([b.me], [([b.me], 1)],
+                                 asset=asset3).sign([b.me_private])
+
+        # create the block
+        block = b.create_block([tx1, tx2, tx3])
+        b.write_block(block)
+
+        # vote valid
+        vote = b.vote(block.id, b.get_last_voted_block().id, True)
+        b.write_vote(vote)
+
+        # get the assets through text search
+        # this query only works with MongoDB
+        # (text_search is a generator, so list() is what triggers the query)
+        try:
+            assets = list(b.text_search('bigchaindb'))
+        except OperationError:
+            assert not isinstance(b.connection, MongoDBConnection)
+        else:
+            assert len(assets) == 3
+
+    @pytest.mark.genesis
+    def test_text_search_returns_valid_only(self, monkeypatch, b):
+        """Check that ``text_search`` skips assets from invalid blocks.
+
+        Two blocks are written, the first voted valid and the second voted
+        invalid; only the asset of the first transaction is expected back.
+        If the query raises ``OperationError`` the test only checks that
+        the connection is not a MongoDB one.
+        """
+        from bigchaindb.models import Transaction
+        from bigchaindb.backend.exceptions import OperationError
+        from bigchaindb.backend.mongodb.connection import MongoDBConnection
+
+        asset_valid = {'msg': 'Hello BigchainDB!'}
+        asset_invalid = {'msg': 'Goodbye BigchainDB!'}
+
+        # NOTE(review): time.time is pinned to two different values,
+        # presumably so the two transactions/blocks get distinct,
+        # deterministic timestamps -- confirm
+        monkeypatch.setattr('time.time', lambda: 1000000000)
+        tx1 = Transaction.create([b.me], [([b.me], 1)],
+                                 asset=asset_valid)
+        tx1 = tx1.sign([b.me_private])
+        block1 = b.create_block([tx1])
+        b.write_block(block1)
+
+        monkeypatch.setattr('time.time', lambda: 1000000020)
+        tx2 = Transaction.create([b.me], [([b.me], 1)],
+                                 asset=asset_invalid)
+        tx2 = tx2.sign([b.me_private])
+        block2 = b.create_block([tx2])
+        b.write_block(block2)
+
+        # vote the first block valid
+        vote = b.vote(block1.id, b.get_last_voted_block().id, True)
+        b.write_vote(vote)
+
+        # vote the second block invalid
+        vote = b.vote(block2.id, b.get_last_voted_block().id, False)
+        b.write_vote(vote)
+
+        # get assets with text search
+        # (both assets contain the search term; only validity tells them apart)
+        try:
+            assets = list(b.text_search('bigchaindb'))
+        except OperationError:
+            assert not isinstance(b.connection, MongoDBConnection)
+            return
+
+        # should only return one asset
+        assert len(assets) == 1
+        # should return the asset created by tx1
+        assert assets[0] == {
+            'data': {'msg': 'Hello BigchainDB!'},
+            'id': tx1.id
+        }
+
    @pytest.mark.usefixtures('inputs')
    def test_write_transaction(self, b, user_pk, user_sk):
        from bigchaindb import Bigchain