Commit e1aa292e authored by Michal Čihař

Rewrite Whoosh index

We no longer index units by checksum, which can lead to weird
consequences with monolingual file formats, where the checksum contains
only the context and not the source.

Fixes #702

See also issue #800
Signed-off-by: Michal Čihař <michal@cihar.com>
parent ec823161
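To illustrate the kind of collision the commit message alludes to, here is a minimal, self-contained Whoosh sketch. It is not Weblate code; the field names simply mirror the schemas changed in this commit, and the checksum values are made up. It assumes two distinct units end up with the same checksum (plausible with monolingual formats, where the checksum covers only the context): keyed on the checksum, the second `update_document()` call replaces the first document, while keying on the database primary key keeps both.

```python
# Minimal sketch, not Weblate code: why a context-only checksum is a poor
# unique key for the fulltext index, while the database primary key is not.
import tempfile

from whoosh.fields import SchemaClass, TEXT, ID, NUMERIC
from whoosh.index import create_in


class ChecksumSchema(SchemaClass):
    # Old layout: documents keyed by checksum.
    checksum = ID(stored=True, unique=True)
    source = TEXT(stored=True)


class PkSchema(SchemaClass):
    # New layout: documents keyed by database primary key.
    pk = NUMERIC(stored=True, unique=True)
    source = TEXT(stored=True)


index_dir = tempfile.mkdtemp()

old_index = create_in(index_dir, ChecksumSchema(), indexname='by_checksum')
with old_index.writer() as writer:
    writer.update_document(checksum=u'ctx:hello', source=u'Hello')
with old_index.writer() as writer:
    # A different source string that shares the checksum silently replaces
    # the previously indexed document.
    writer.update_document(checksum=u'ctx:hello', source=u'Goodbye')
print(old_index.doc_count())  # 1 -- the first unit is gone from the index

new_index = create_in(index_dir, PkSchema(), indexname='by_pk')
with new_index.writer() as writer:
    writer.update_document(pk=1, source=u'Hello')
with new_index.writer() as writer:
    writer.update_document(pk=2, source=u'Goodbye')
print(new_index.doc_count())  # 2 -- both units stay searchable
```

This is also why the documentation hunk below tells administrators to rebuild the fulltext index after upgrading to 2.5.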
@@ -282,6 +282,20 @@ importantly:
 There is now also additional dependency - ``django_compressor``, please install
 it prior to upgrading.
+
+Upgrade from 2.4 to 2.5
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Please adjust your :file:`settings.py` to match several changes in the
+configuration (consult :file:`settings_example.py` for correct values).
+
+The fulltext index has been changed, so unless you rebuild it, the fulltext
+search will not work. To rebuild it, execute:
+
+.. code-block:: sh
+
+    ./manage.py rebuild_index --clean --all
+
 .. _django-17:
 
 Upgrading to Django 1.7
...

@@ -26,6 +26,7 @@ Released on ? 2015.
 * Extended list of options for import_project.
 * Improved targeting for whiteboard messages.
 * Support for automatic translation across projects.
+* Optimized fulltext search index.
 
 weblate 2.4
 -----------
...

@@ -37,6 +37,7 @@ from weblate.trans.models.dictionary import Dictionary
 from weblate.trans.models.source import Source
 from weblate.trans.models.advertisement import Advertisement
 from weblate.trans.models.whiteboard import WhiteboardMessage
+from weblate.trans.search import clean_search_unit
 from weblate.trans.signals import (
     vcs_post_push, vcs_post_update, vcs_pre_commit, vcs_post_commit,
     user_pre_delete, translation_post_add,
@@ -156,7 +157,7 @@ def cleanup_deleted(sender, instance, **kwargs):
     '''
     project = instance.translation.subproject.project
     language = instance.translation.language
-    contentsum = instance.translation
+    contentsum = instance.contentsum
     units = Unit.objects.filter(
         translation__language=language,
         translation__subproject__project=project,
@@ -206,6 +207,9 @@ def cleanup_deleted(sender, instance, **kwargs):
         contentsum=contentsum
     ).delete()
 
+    # Cleanup fulltext index
+    clean_search_unit(instance.pk, language.code)
+
 
 @receiver(vcs_post_push)
 def post_push(sender, component, **kwargs):
...

@@ -58,11 +58,11 @@ SIMPLE_FILTERS = {
 SEARCH_FILTERS = ('source', 'target', 'context', 'location', 'comment')
 
 
-def more_like_queue(checksum, source, top, queue):
+def more_like_queue(pk, source, top, queue):
     """
     Multiprocess wrapper around more_like.
     """
-    result = more_like(checksum, source, top)
+    result = more_like(pk, source, top)
     queue.put(result)
@@ -244,7 +244,7 @@ class UnitManager(models.Manager):
         else:
             lang = self.all()[0].translation.language.code
         return base.filter(
-            checksum__in=fulltext_search(
+            pk__in=fulltext_search(
                 params['q'],
                 lang,
                 params
@@ -255,14 +255,14 @@ class UnitManager(models.Manager):
         """
         Finds units with same source.
         """
-        checksums = fulltext_search(
+        pks = fulltext_search(
             unit.get_source_plurals()[0],
             unit.translation.language.code,
             {'source': True}
         )
 
         return self.filter(
-            checksum__in=checksums,
+            pk__in=pks,
             translation__language=unit.translation.language,
             translated=True
         ).exclude(
@@ -277,7 +277,7 @@ class UnitManager(models.Manager):
         queue = multiprocessing.Queue()
         proc = multiprocessing.Process(
             target=more_like_queue,
-            args=(unit.checksum, unit.source, top, queue)
+            args=(unit.pk, unit.source, top, queue)
         )
         proc.start()
         proc.join(appsettings.MT_WEBLATE_LIMIT)
@@ -289,7 +289,7 @@ class UnitManager(models.Manager):
             more_results = queue.get()
         else:
-            more_results = more_like(unit.checksum, unit.source, top)
+            more_results = more_like(unit.pk, unit.source, top)
 
         same_results = fulltext_search(
             unit.get_source_plurals()[0],
@@ -297,10 +297,9 @@ class UnitManager(models.Manager):
             {'source': True}
         )
 
-        checksums = more_results - same_results
         return self.filter(
-            checksum__in=checksums,
+            pk__in=more_results - same_results,
             translation__language=unit.translation.language,
             translated=True
         ).exclude(
...

@@ -23,7 +23,7 @@ Whoosh based full text search.
 '''
 import shutil
 
-from whoosh.fields import SchemaClass, TEXT, ID
+from whoosh.fields import SchemaClass, TEXT, ID, NUMERIC
 from whoosh.filedb.filestore import FileStorage
 from whoosh.writing import AsyncWriter, BufferedWriter
 from whoosh import qparser
@@ -43,7 +43,7 @@ class TargetSchema(SchemaClass):
     '''
     Fultext index schema for target strings.
     '''
-    checksum = ID(stored=True, unique=True)
+    pk = NUMERIC(stored=True, unique=True)
     target = TEXT()
     comment = TEXT()
@@ -52,7 +52,7 @@ class SourceSchema(SchemaClass):
     '''
     Fultext index schema for source and context strings.
     '''
-    checksum = ID(stored=True, unique=True)
+    pk = NUMERIC(stored=True, unique=True)
     source = TEXT()
     context = TEXT()
     location = TEXT()
@@ -93,7 +93,7 @@ def update_source_unit_index(writer, unit):
     Updates source index for given unit.
     '''
     writer.update_document(
-        checksum=force_text(unit.checksum),
+        pk=unit.pk,
         source=force_text(unit.source),
         context=force_text(unit.context),
         location=force_text(unit.location),
@@ -105,7 +105,7 @@ def update_target_unit_index(writer, unit):
     Updates target index for given unit.
     '''
     writer.update_document(
-        checksum=force_text(unit.checksum),
+        pk=unit.pk,
         target=force_text(unit.target),
         comment=force_text(unit.comment),
     )
@@ -125,6 +125,10 @@ def get_source_index():
     index = STORAGE.open_index('source')
     if 'location' not in index.schema:
         index.add_field('location', TEXT)
+    if 'pk' not in index.schema:
+        index.add_field('pk', NUMERIC)
+    if 'checksum' in index.schema:
+        index.remove_field('checksum')
     return index
@@ -143,6 +147,10 @@ def get_target_index(lang):
     index = STORAGE.open_index(name)
     if 'comment' not in index.schema:
         index.add_field('comment', TEXT)
+    if 'pk' not in index.schema:
+        index.add_field('pk', NUMERIC)
+    if 'checksum' in index.schema:
+        index.remove_field('checksum')
     return index
@@ -228,14 +236,14 @@ def base_search(searcher, field, schema, query):
     '''
     parser = qparser.QueryParser(field, schema)
     parsed = parser.parse(query)
-    return [result['checksum'] for result in searcher.search(parsed)]
+    return [result['pk'] for result in searcher.search(parsed)]
 
 
 def fulltext_search(query, lang, params):
     '''
-    Performs fulltext search in given areas, returns set of checksums.
+    Performs fulltext search in given areas, returns set of primary keys.
     '''
-    checksums = set()
+    pks = set()
     search = {
         'source': False,
@@ -251,7 +259,7 @@ def fulltext_search(query, lang, params):
         with index.searcher() as searcher:
             for param in ('source', 'context', 'location'):
                 if search[param]:
-                    checksums.update(
+                    pks.update(
                         base_search(searcher, param, SourceSchema(), query)
                     )
@@ -260,23 +268,31 @@ def fulltext_search(query, lang, params):
         with index.searcher() as searcher:
             for param in ('target', 'comment'):
                 if search[param]:
-                    checksums.update(
+                    pks.update(
                         base_search(searcher, param, TargetSchema(), query)
                     )
 
-    return checksums
+    return pks
 
 
-def more_like(checksum, source, top=5):
+def more_like(pk, source, top=5):
     '''
     Finds similar units.
     '''
     index = get_source_index()
     with index.searcher() as searcher:
-        docnum = searcher.document_number(checksum=checksum)
+        docnum = searcher.document_number(pk=pk)
         if docnum is None:
            return set()
         results = searcher.more_like(docnum, 'source', source, top)
-        return set([result['checksum'] for result in results])
+        return set([result['pk'] for result in results])
+
+
+def clean_search_unit(pk, lang):
+    """Cleanups search index on unit deletion."""
+    index = get_target_index(lang)
+    index.writer().delete_by_term('pk', pk)
+    index = get_source_index()
+    index.writer().delete_by_term('pk', pk)
...