Commit e1aa292e authored by Michal Čihař

Rewrite Whoosh index

Units are no longer indexed by checksum, because that could lead to weird
consequences with monolingual file formats, where the checksum contains only
the context and not the source.

Fixes #702

See also issue #800
Signed-off-by: Michal Čihař <michal@cihar.com>
parent ec823161
......@@ -282,6 +282,20 @@ importantly:
There is now also an additional dependency, ``django_compressor``; please
install it prior to upgrading.
Upgrade from 2.4 to 2.5
~~~~~~~~~~~~~~~~~~~~~~~
Please adjust your :file:`settings.py` to match several changes in the
configuration (consult :file:`settings_example.py` for correct values).
The fulltext index has been changed, so unless you rebuild it, the fulltext
search will not work. To rebuild it, execute:
.. code-block:: sh
./manage.py rebuild_index --clean --all
.. _django-17:
Upgrading to Django 1.7
......
......@@ -26,6 +26,7 @@ Released on ? 2015.
* Extended list of options for import_project.
* Improved targeting for whiteboard messages.
* Support for automatic translation across projects.
* Optimized fulltext search index.
weblate 2.4
-----------
......
......@@ -37,6 +37,7 @@ from weblate.trans.models.dictionary import Dictionary
from weblate.trans.models.source import Source
from weblate.trans.models.advertisement import Advertisement
from weblate.trans.models.whiteboard import WhiteboardMessage
from weblate.trans.search import clean_search_unit
from weblate.trans.signals import (
vcs_post_push, vcs_post_update, vcs_pre_commit, vcs_post_commit,
user_pre_delete, translation_post_add,
......@@ -156,7 +157,7 @@ def cleanup_deleted(sender, instance, **kwargs):
'''
project = instance.translation.subproject.project
language = instance.translation.language
contentsum = instance.translation
contentsum = instance.contentsum
units = Unit.objects.filter(
translation__language=language,
translation__subproject__project=project,
......@@ -206,6 +207,9 @@ def cleanup_deleted(sender, instance, **kwargs):
contentsum=contentsum
).delete()
# Cleanup fulltext index
clean_search_unit(instance.pk, language.code)
@receiver(vcs_post_push)
def post_push(sender, component, **kwargs):
......
......@@ -58,11 +58,11 @@ SIMPLE_FILTERS = {
SEARCH_FILTERS = ('source', 'target', 'context', 'location', 'comment')
def more_like_queue(checksum, source, top, queue):
def more_like_queue(pk, source, top, queue):
"""
Multiprocess wrapper around more_like.
"""
result = more_like(checksum, source, top)
result = more_like(pk, source, top)
queue.put(result)
......@@ -244,7 +244,7 @@ class UnitManager(models.Manager):
else:
lang = self.all()[0].translation.language.code
return base.filter(
checksum__in=fulltext_search(
pk__in=fulltext_search(
params['q'],
lang,
params
......@@ -255,14 +255,14 @@ class UnitManager(models.Manager):
"""
Finds units with same source.
"""
checksums = fulltext_search(
pks = fulltext_search(
unit.get_source_plurals()[0],
unit.translation.language.code,
{'source': True}
)
return self.filter(
checksum__in=checksums,
pk__in=pks,
translation__language=unit.translation.language,
translated=True
).exclude(
......@@ -277,7 +277,7 @@ class UnitManager(models.Manager):
queue = multiprocessing.Queue()
proc = multiprocessing.Process(
target=more_like_queue,
args=(unit.checksum, unit.source, top, queue)
args=(unit.pk, unit.source, top, queue)
)
proc.start()
proc.join(appsettings.MT_WEBLATE_LIMIT)
......@@ -289,7 +289,7 @@ class UnitManager(models.Manager):
more_results = queue.get()
else:
more_results = more_like(unit.checksum, unit.source, top)
more_results = more_like(unit.pk, unit.source, top)
same_results = fulltext_search(
unit.get_source_plurals()[0],
......@@ -297,10 +297,9 @@ class UnitManager(models.Manager):
{'source': True}
)
checksums = more_results - same_results
return self.filter(
checksum__in=checksums,
pk__in=more_results - same_results,
translation__language=unit.translation.language,
translated=True
).exclude(
......
......@@ -23,7 +23,7 @@ Whoosh based full text search.
'''
import shutil
from whoosh.fields import SchemaClass, TEXT, ID
from whoosh.fields import SchemaClass, TEXT, ID, NUMERIC
from whoosh.filedb.filestore import FileStorage
from whoosh.writing import AsyncWriter, BufferedWriter
from whoosh import qparser
......@@ -43,7 +43,7 @@ class TargetSchema(SchemaClass):
'''
Fultext index schema for target strings.
'''
checksum = ID(stored=True, unique=True)
pk = NUMERIC(stored=True, unique=True)
target = TEXT()
comment = TEXT()
......@@ -52,7 +52,7 @@ class SourceSchema(SchemaClass):
'''
Fultext index schema for source and context strings.
'''
checksum = ID(stored=True, unique=True)
pk = NUMERIC(stored=True, unique=True)
source = TEXT()
context = TEXT()
location = TEXT()
......@@ -93,7 +93,7 @@ def update_source_unit_index(writer, unit):
Updates source index for given unit.
'''
writer.update_document(
checksum=force_text(unit.checksum),
pk=unit.pk,
source=force_text(unit.source),
context=force_text(unit.context),
location=force_text(unit.location),
......@@ -105,7 +105,7 @@ def update_target_unit_index(writer, unit):
Updates target index for given unit.
'''
writer.update_document(
checksum=force_text(unit.checksum),
pk=unit.pk,
target=force_text(unit.target),
comment=force_text(unit.comment),
)
......@@ -125,6 +125,10 @@ def get_source_index():
index = STORAGE.open_index('source')
if 'location' not in index.schema:
index.add_field('location', TEXT)
if 'pk' not in index.schema:
index.add_field('pk', NUMERIC)
if 'checksum' in index.schema:
index.remove_field('checksum')
return index
......@@ -143,6 +147,10 @@ def get_target_index(lang):
index = STORAGE.open_index(name)
if 'comment' not in index.schema:
index.add_field('comment', TEXT)
if 'pk' not in index.schema:
index.add_field('pk', NUMERIC)
if 'checksum' in index.schema:
index.remove_field('checksum')
return index
......@@ -228,14 +236,14 @@ def base_search(searcher, field, schema, query):
'''
parser = qparser.QueryParser(field, schema)
parsed = parser.parse(query)
return [result['checksum'] for result in searcher.search(parsed)]
return [result['pk'] for result in searcher.search(parsed)]
def fulltext_search(query, lang, params):
'''
Performs fulltext search in given areas, returns set of checksums.
Performs fulltext search in given areas, returns set of primary keys.
'''
checksums = set()
pks = set()
search = {
'source': False,
......@@ -251,7 +259,7 @@ def fulltext_search(query, lang, params):
with index.searcher() as searcher:
for param in ('source', 'context', 'location'):
if search[param]:
checksums.update(
pks.update(
base_search(searcher, param, SourceSchema(), query)
)
......@@ -260,23 +268,31 @@ def fulltext_search(query, lang, params):
with index.searcher() as searcher:
for param in ('target', 'comment'):
if search[param]:
checksums.update(
pks.update(
base_search(searcher, param, TargetSchema(), query)
)
return checksums
return pks
def more_like(checksum, source, top=5):
def more_like(pk, source, top=5):
'''
Finds similar units.
'''
index = get_source_index()
with index.searcher() as searcher:
docnum = searcher.document_number(checksum=checksum)
docnum = searcher.document_number(pk=pk)
if docnum is None:
return set()
results = searcher.more_like(docnum, 'source', source, top)
return set([result['checksum'] for result in results])
return set([result['pk'] for result in results])
def clean_search_unit(pk, lang):
    """Remove a deleted unit from the fulltext indexes.

    Deletes the document whose ``pk`` field matches the given primary key
    from the target index of language ``lang`` and from the source index.

    :param pk: primary key of the unit being removed
    :param lang: language code selecting the target index
    """
    # Whoosh buffers deletions inside the writer; they reach the index only
    # on commit. Using the writer as a context manager commits on success,
    # cancels on error and releases the index write lock in both cases.
    index = get_target_index(lang)
    with index.writer() as writer:
        writer.delete_by_term('pk', pk)

    index = get_source_index()
    with index.writer() as writer:
        writer.delete_by_term('pk', pk)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment