Commit e1aa292e authored by Michal Čihař

Rewrite Whoosh index

We no longer index units by checksum, which can lead to weird
consequences with monolingual file formats, where the checksum contains
only the context and not the source.

Fixes #702

See also issue #800
Signed-off-by: Michal Čihař <michal@cihar.com>
parent ec823161
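To illustrate the kind of collision the commit message alludes to, here is a minimal, self-contained Whoosh sketch. It is not Weblate code; the field names simply mirror the schemas changed in this commit, and the checksum values are made up. It assumes two distinct units end up with the same checksum (plausible with monolingual formats, where the checksum covers only the context): keyed on the checksum, the second `update_document()` call replaces the first document, while keying on the database primary key keeps both.

```python
# Minimal sketch, not Weblate code: why a context-only checksum is a poor
# unique key for the fulltext index, while the database primary key is not.
import tempfile

from whoosh.fields import SchemaClass, TEXT, ID, NUMERIC
from whoosh.index import create_in


class ChecksumSchema(SchemaClass):
    # Old layout: documents keyed by checksum.
    checksum = ID(stored=True, unique=True)
    source = TEXT(stored=True)


class PkSchema(SchemaClass):
    # New layout: documents keyed by database primary key.
    pk = NUMERIC(stored=True, unique=True)
    source = TEXT(stored=True)


index_dir = tempfile.mkdtemp()

old_index = create_in(index_dir, ChecksumSchema(), indexname='by_checksum')
with old_index.writer() as writer:
    writer.update_document(checksum=u'ctx:hello', source=u'Hello')
with old_index.writer() as writer:
    # A different source string that shares the checksum silently replaces
    # the previously indexed document.
    writer.update_document(checksum=u'ctx:hello', source=u'Goodbye')
print(old_index.doc_count())  # 1 -- the first unit is gone from the index

new_index = create_in(index_dir, PkSchema(), indexname='by_pk')
with new_index.writer() as writer:
    writer.update_document(pk=1, source=u'Hello')
with new_index.writer() as writer:
    writer.update_document(pk=2, source=u'Goodbye')
print(new_index.doc_count())  # 2 -- both units stay searchable
```

This is also why the documentation hunk below tells administrators to rebuild the fulltext index after upgrading to 2.5.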
@@ -282,6 +282,20 @@ importantly:
 There is now also additional dependency - ``django_compressor``, please install
 it prior to upgrading.
+
+Upgrade from 2.4 to 2.5
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Please adjust your :file:`settings.py` to match several changes in the
+configuration (consult :file:`settings_example.py` for correct values).
+
+The fulltext index has been changed, so unless you rebuild it, the fulltext
+search will not work. To rebuild it, execute:
+
+.. code-block:: sh
+
+    ./manage.py rebuild_index --clean --all
+
 .. _django-17:
 
 Upgrading to Django 1.7
...

@@ -26,6 +26,7 @@ Released on ? 2015.
 * Extended list of options for import_project.
 * Improved targeting for whiteboard messages.
 * Support for automatic translation across projects.
+* Optimized fulltext search index.
 
 weblate 2.4
 -----------
...

@@ -37,6 +37,7 @@ from weblate.trans.models.dictionary import Dictionary
 from weblate.trans.models.source import Source
 from weblate.trans.models.advertisement import Advertisement
 from weblate.trans.models.whiteboard import WhiteboardMessage
+from weblate.trans.search import clean_search_unit
 from weblate.trans.signals import (
     vcs_post_push, vcs_post_update, vcs_pre_commit, vcs_post_commit,
     user_pre_delete, translation_post_add,
@@ -156,7 +157,7 @@ def cleanup_deleted(sender, instance, **kwargs):
     '''
     project = instance.translation.subproject.project
     language = instance.translation.language
-    contentsum = instance.translation
+    contentsum = instance.contentsum
     units = Unit.objects.filter(
         translation__language=language,
         translation__subproject__project=project,
@@ -206,6 +207,9 @@ def cleanup_deleted(sender, instance, **kwargs):
         contentsum=contentsum
     ).delete()
 
+    # Cleanup fulltext index
+    clean_search_unit(instance.pk, language.code)
+
 
 @receiver(vcs_post_push)
 def post_push(sender, component, **kwargs):
...

@@ -58,11 +58,11 @@ SIMPLE_FILTERS = {
 SEARCH_FILTERS = ('source', 'target', 'context', 'location', 'comment')
 
 
-def more_like_queue(checksum, source, top, queue):
+def more_like_queue(pk, source, top, queue):
     """
     Multiprocess wrapper around more_like.
     """
-    result = more_like(checksum, source, top)
+    result = more_like(pk, source, top)
     queue.put(result)
@@ -244,7 +244,7 @@ class UnitManager(models.Manager):
         else:
             lang = self.all()[0].translation.language.code
         return base.filter(
-            checksum__in=fulltext_search(
+            pk__in=fulltext_search(
                 params['q'],
                 lang,
                 params
@@ -255,14 +255,14 @@ class UnitManager(models.Manager):
         """
         Finds units with same source.
         """
-        checksums = fulltext_search(
+        pks = fulltext_search(
             unit.get_source_plurals()[0],
             unit.translation.language.code,
             {'source': True}
         )
 
         return self.filter(
-            checksum__in=checksums,
+            pk__in=pks,
             translation__language=unit.translation.language,
             translated=True
         ).exclude(
@@ -277,7 +277,7 @@ class UnitManager(models.Manager):
         queue = multiprocessing.Queue()
         proc = multiprocessing.Process(
             target=more_like_queue,
-            args=(unit.checksum, unit.source, top, queue)
+            args=(unit.pk, unit.source, top, queue)
         )
         proc.start()
         proc.join(appsettings.MT_WEBLATE_LIMIT)
@@ -289,7 +289,7 @@ class UnitManager(models.Manager):
             more_results = queue.get()
         else:
-            more_results = more_like(unit.checksum, unit.source, top)
+            more_results = more_like(unit.pk, unit.source, top)
 
         same_results = fulltext_search(
             unit.get_source_plurals()[0],
@@ -297,10 +297,9 @@ class UnitManager(models.Manager):
             {'source': True}
         )
 
-        checksums = more_results - same_results
         return self.filter(
-            checksum__in=checksums,
+            pk__in=more_results - same_results,
             translation__language=unit.translation.language,
             translated=True
         ).exclude(
...

@@ -23,7 +23,7 @@ Whoosh based full text search.
 '''
 import shutil
 
-from whoosh.fields import SchemaClass, TEXT, ID
+from whoosh.fields import SchemaClass, TEXT, ID, NUMERIC
 from whoosh.filedb.filestore import FileStorage
 from whoosh.writing import AsyncWriter, BufferedWriter
 from whoosh import qparser
@@ -43,7 +43,7 @@ class TargetSchema(SchemaClass):
     '''
     Fultext index schema for target strings.
     '''
-    checksum = ID(stored=True, unique=True)
+    pk = NUMERIC(stored=True, unique=True)
     target = TEXT()
     comment = TEXT()
@@ -52,7 +52,7 @@ class SourceSchema(SchemaClass):
     '''
     Fultext index schema for source and context strings.
     '''
-    checksum = ID(stored=True, unique=True)
+    pk = NUMERIC(stored=True, unique=True)
     source = TEXT()
     context = TEXT()
     location = TEXT()
@@ -93,7 +93,7 @@ def update_source_unit_index(writer, unit):
     Updates source index for given unit.
     '''
     writer.update_document(
-        checksum=force_text(unit.checksum),
+        pk=unit.pk,
         source=force_text(unit.source),
         context=force_text(unit.context),
         location=force_text(unit.location),
@@ -105,7 +105,7 @@ def update_target_unit_index(writer, unit):
     Updates target index for given unit.
     '''
     writer.update_document(
-        checksum=force_text(unit.checksum),
+        pk=unit.pk,
         target=force_text(unit.target),
         comment=force_text(unit.comment),
     )
@@ -125,6 +125,10 @@ def get_source_index():
     index = STORAGE.open_index('source')
     if 'location' not in index.schema:
         index.add_field('location', TEXT)
+    if 'pk' not in index.schema:
+        index.add_field('pk', NUMERIC)
+    if 'checksum' in index.schema:
+        index.remove_field('checksum')
     return index
@@ -143,6 +147,10 @@ def get_target_index(lang):
     index = STORAGE.open_index(name)
     if 'comment' not in index.schema:
         index.add_field('comment', TEXT)
+    if 'pk' not in index.schema:
+        index.add_field('pk', NUMERIC)
+    if 'checksum' in index.schema:
+        index.remove_field('checksum')
     return index
@@ -228,14 +236,14 @@ def base_search(searcher, field, schema, query):
     '''
     parser = qparser.QueryParser(field, schema)
     parsed = parser.parse(query)
-    return [result['checksum'] for result in searcher.search(parsed)]
+    return [result['pk'] for result in searcher.search(parsed)]
 
 
 def fulltext_search(query, lang, params):
     '''
-    Performs fulltext search in given areas, returns set of checksums.
+    Performs fulltext search in given areas, returns set of primary keys.
     '''
-    checksums = set()
+    pks = set()
     search = {
         'source': False,
@@ -251,7 +259,7 @@ def fulltext_search(query, lang, params):
         with index.searcher() as searcher:
             for param in ('source', 'context', 'location'):
                 if search[param]:
-                    checksums.update(
+                    pks.update(
                         base_search(searcher, param, SourceSchema(), query)
                     )
@@ -260,23 +268,31 @@ def fulltext_search(query, lang, params):
         with index.searcher() as searcher:
             for param in ('target', 'comment'):
                 if search[param]:
-                    checksums.update(
+                    pks.update(
                         base_search(searcher, param, TargetSchema(), query)
                     )
 
-    return checksums
+    return pks
 
 
-def more_like(checksum, source, top=5):
+def more_like(pk, source, top=5):
     '''
     Finds similar units.
     '''
     index = get_source_index()
     with index.searcher() as searcher:
-        docnum = searcher.document_number(checksum=checksum)
+        docnum = searcher.document_number(pk=pk)
         if docnum is None:
            return set()
         results = searcher.more_like(docnum, 'source', source, top)
-        return set([result['checksum'] for result in results])
+        return set([result['pk'] for result in results])
+
+
+def clean_search_unit(pk, lang):
+    """Cleanups search index on unit deletion."""
+    index = get_target_index(lang)
+    index.writer().delete_by_term('pk', pk)
+    index = get_source_index()
+    index.writer().delete_by_term('pk', pk)
...