From ccaae91601d174dbede2102458410597ec14a8ed Mon Sep 17 00:00:00 2001
From: Rodolphe Marques
Date: Fri, 12 May 2017 15:22:11 +0200
Subject: [PATCH] Added full text search support for mongodb.

- Created wildcard text index for the assets collection.
- Created backend query to do text search on the assets collection.
- Added and updated tests.
---
 bigchaindb/backend/mongodb/query.py   | 25 ++++++++
 bigchaindb/backend/mongodb/schema.py  |  5 +-
 bigchaindb/backend/query.py           |  8 +++
 tests/backend/mongodb/test_queries.py | 84 +++++++++++++++++++++++++++
 tests/backend/mongodb/test_schema.py  |  2 +-
 5 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/bigchaindb/backend/mongodb/query.py b/bigchaindb/backend/mongodb/query.py
index 39d99d4a..3d989a3d 100644
--- a/bigchaindb/backend/mongodb/query.py
+++ b/bigchaindb/backend/mongodb/query.py
@@ -327,3 +327,28 @@ def get_unvoted_blocks(conn, node_pubkey):
                 'votes': False, '_id': False
             }}
         ]))
+
+
+@register_query(MongoDBConnection)
+def text_search(conn, search, language='english', case_sensitive=False,
+                diacritic_sensitive=False, text_score=False, limit=0):
+    cursor = conn.run(
+        conn.collection('assets')
+        .find({'$text': {
+            '$search': search,
+            '$language': language,
+            '$caseSensitive': case_sensitive,
+            '$diacriticSensitive': diacritic_sensitive}},
+              {'score': {'$meta': 'textScore'}, '_id': False})
+        .sort([('score', {'$meta': 'textScore'})])
+        .limit(limit))
+
+    if text_score:
+        return cursor
+    else:
+        return (_remove_text_score(asset) for asset in cursor)
+
+
+def _remove_text_score(asset):
+    asset.pop('score', None)
+    return asset
diff --git a/bigchaindb/backend/mongodb/schema.py b/bigchaindb/backend/mongodb/schema.py
index 12b873e0..6c54bfd8 100644
--- a/bigchaindb/backend/mongodb/schema.py
+++ b/bigchaindb/backend/mongodb/schema.py
@@ -2,7 +2,7 @@
 
 import logging
 
-from pymongo import ASCENDING, DESCENDING
+from pymongo import ASCENDING, DESCENDING, TEXT
 
 from bigchaindb import backend
 from bigchaindb.common import exceptions
@@ -113,3 +113,6 @@ def create_assets_secondary_index(conn, dbname):
     conn.conn[dbname]['assets'].create_index('id',
                                              name='asset_id',
                                              unique=True)
+
+    # full text search index
+    conn.conn[dbname]['assets'].create_index([('$**', TEXT)], name='text')
diff --git a/bigchaindb/backend/query.py b/bigchaindb/backend/query.py
index 8245fb3d..705b0306 100644
--- a/bigchaindb/backend/query.py
+++ b/bigchaindb/backend/query.py
@@ -325,3 +325,11 @@ def get_txids_filtered(connection, asset_id, operation=None):
     """
 
     raise NotImplementedError
+
+
+@singledispatch
+def text_search(conn, search, language='english', case_sensitive=False,
+                diacritic_sensitive=False, text_score=False, limit=0):
+    # TODO: docstring
+
+    raise NotImplementedError
diff --git a/tests/backend/mongodb/test_queries.py b/tests/backend/mongodb/test_queries.py
index c43c5fa4..9b2ad1cf 100644
--- a/tests/backend/mongodb/test_queries.py
+++ b/tests/backend/mongodb/test_queries.py
@@ -464,3 +464,87 @@ def test_get_assets():
 
     assert cursor.count() == 2
     assert list(cursor.sort('id', pymongo.ASCENDING)) == assets[::2]
+
+
+def test_text_search():
+    from bigchaindb.backend import connect, query
+    conn = connect()
+
+    # Example data and test cases taken from the MongoDB documentation
+    # https://docs.mongodb.com/manual/reference/operator/query/text/
+    assets = [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+        {'id': 3, 'subject': 'Baking a cake', 'author': 'abc', 'views': 90},
+        {'id': 4, 'subject': 'baking', 'author': 'xyz', 'views': 100},
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+        {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
+        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
+    ]
+
+    # insert the assets
+    conn.db.assets.insert_many(deepcopy(assets), ordered=False)
+
+    # test search single word
+    assert list(query.text_search(conn, 'coffee')) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
+    ]
+
+    # match any of the search terms
+    assert list(query.text_search(conn, 'bake coffee cake')) == [
+        {'author': 'abc', 'id': 3, 'subject': 'Baking a cake', 'views': 90},
+        {'author': 'xyz', 'id': 1, 'subject': 'coffee', 'views': 50},
+        {'author': 'xyz', 'id': 4, 'subject': 'baking', 'views': 100},
+        {'author': 'efg', 'id': 2, 'subject': 'Coffee Shopping', 'views': 5},
+        {'author': 'efg', 'id': 7, 'subject': 'coffee and cream', 'views': 10}
+    ]
+
+    # search for a phrase
+    assert list(query.text_search(conn, '\"coffee shop\"')) == [
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+    ]
+
+    # exclude documents that contain a term
+    assert list(query.text_search(conn, 'coffee -shop')) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
+    ]
+
+    # search different language
+    assert list(query.text_search(conn, 'leche', language='es')) == [
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
+    ]
+
+    # case and diacritic insensitive search
+    assert list(query.text_search(conn, 'сы́рники CAFÉS')) == [
+        {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80},
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
+    ]
+
+    # case sensitive search
+    assert list(query.text_search(conn, 'Coffee', case_sensitive=True)) == [
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+    ]
+
+    # diacritic sensitive search
+    assert list(query.text_search(conn, 'CAFÉ', diacritic_sensitive=True)) == [
+        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
+    ]
+
+    # return text score
+    assert list(query.text_search(conn, 'coffee', text_score=True)) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50, 'score': 1.0},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5, 'score': 0.75},
+        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10, 'score': 0.75},
+    ]
+
+    # limit search result
+    assert list(query.text_search(conn, 'coffee', limit=2)) == [
+        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
+        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
+    ]
diff --git a/tests/backend/mongodb/test_schema.py b/tests/backend/mongodb/test_schema.py
index e3b320bd..e11dbfe8 100644
--- a/tests/backend/mongodb/test_schema.py
+++ b/tests/backend/mongodb/test_schema.py
@@ -33,7 +33,7 @@ def test_init_creates_db_tables_and_indexes():
     assert sorted(indexes) == ['_id_', 'block_and_voter']
 
     indexes = conn.conn[dbname]['assets'].index_information().keys()
-    assert sorted(indexes) == ['_id_', 'asset_id']
+    assert sorted(indexes) == ['_id_', 'asset_id', 'text']
 
 
 def test_init_database_fails_if_db_exists():
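
Note for reviewers: a minimal usage sketch of the new query, mirroring the calls in
test_text_search above. It assumes a node configured with the MongoDB backend, so
connect() picks up the connection settings from the local BigchainDB config; the names
and parameters are those added by this patch, the variable names are illustrative only.

    from bigchaindb.backend import connect, query

    conn = connect()

    # default: case- and diacritic-insensitive search across all indexed asset fields
    matches = list(query.text_search(conn, 'coffee'))

    # include MongoDB's relevance score and cap the result set at two documents
    scored = list(query.text_search(conn, 'coffee', text_score=True, limit=2))

The wildcard ('$**') text index created in schema.py makes every string field of the
assets collection searchable with a single index, at the cost of a larger index and
no per-field weighting.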