Commit d3f63cc5 authored by Michal Čihař's avatar Michal Čihař

Merge remote-tracking branch 'origin/master'

parents 080093c8 455aeed5
......@@ -2,3 +2,4 @@
*.swp
repos/
*.mo
whoosh-index/
"""
Django Full-text search
Author: Patrick Carroll <patrick@patrickomatic.com>
Version: 0.1
"""
import re
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from ftsearch.stemming import PorterStemmer
try:
getattr(settings, 'SEARCH_STEMMER')
# make sure it has a callable .stem() method
try:
settings.SEARCH_STEMMER().stem('foo')
except AttributeError:
raise ImproperlyConfigured("The supplied stemmer must support a stem() method")
except AttributeError:
settings.SEARCH_STEMMER = PorterStemmer
try:
getattr(settings, 'SEARCH_WORD_SPLIT_REGEX')
except AttributeError:
settings.SEARCH_WORD_SPLIT_REGEX = re.compile(r'\W*')
from django.core.management.base import BaseCommand, CommandError
from trans.models import Unit
from ftsearch.models import WordLocation, Word
from optparse import make_option
class Command(BaseCommand):
    """Management command rebuilding the ftsearch fulltext index."""
    help = 'updates index for fulltext search'
    option_list = BaseCommand.option_list + (
        make_option('--clean',
            action='store_true',
            dest='clean',
            default=False,
            help='removes also all words from database'),
        )

    def handle(self, *args, **options):
        """Index every unit, optionally wiping the word tables first."""
        if options['clean']:
            # Deleting Word cascades into WordLocation, but remove both
            # explicitly to keep the cleanup obvious.
            Word.objects.all().delete()
            WordLocation.objects.all().delete()
        # iterator() streams rows instead of caching the whole Unit table
        # in memory (consistent with the other indexing commands).
        for unit in Unit.objects.all().iterator():
            Unit.objects.add_to_index(unit)
"""
Code based on Django Full-text search
"""
from django.db import models
from lang.models import Language
from trans.models import Unit
class Word(models.Model):
    """One indexed word, optionally bound to a language.

    language is nullable: context strings are indexed with language=None
    (see UnitManager.add_to_index).
    """
    word = models.CharField(max_length=255)
    language = models.ForeignKey(Language, null = True, blank = True)

    def __unicode__(self):
        # Bug fix: language may be None (null=True), in which case the old
        # code raised AttributeError on self.language.name.
        if self.language is None:
            return self.word
        return "%s: %s" % (self.language.name, self.word)

    class Meta:
        unique_together = ('word', 'language')
class WordLocation(models.Model):
    """Occurrence of a word at a given position within a unit's text."""
    # the indexed word
    word = models.ForeignKey(Word)
    # position of the word within the indexed text (see __index_item)
    location = models.PositiveIntegerField()
    # the translation unit the word occurs in
    unit = models.ForeignKey(Unit)

    def __unicode__(self):
        return "%s[%d] (%d)" % (self.word, self.location, self.unit.id)
This diff is collapsed.
......@@ -140,7 +140,6 @@ INSTALLED_APPS = (
'trans',
'lang',
'accounts',
'ftsearch',
'weblate',
)
......@@ -252,3 +251,6 @@ ENABLE_HOOKS = True
# Number of nearby messages to show in each direction
NEARBY_MESSAGES = 5
# Where to put Whoosh index
WHOOSH_INDEX = os.path.join(WEB_ROOT, 'whoosh-index')
from django.core.management.base import BaseCommand, CommandError
from trans.models import Unit
from lang.models import Language
import trans.search
from optparse import make_option
class Command(BaseCommand):
    """Rebuild the Whoosh fulltext indices for all units."""
    help = 'updates index for fulltext search'
    option_list = BaseCommand.option_list + (
        make_option('--clean',
            action='store_true',
            dest='clean',
            default=False,
            help='removes also all words from database'),
        )

    def handle(self, *args, **options):
        all_languages = Language.objects.all()

        # Optionally recreate every index from scratch.
        if options['clean']:
            trans.search.create_source_index()
            for language in all_languages:
                trans.search.create_target_index(lang = language.code)

        # Index source strings (shared across all languages).
        source_units = Unit.objects.values(
            'checksum', 'source', 'context', 'translation_id').distinct()
        with trans.search.get_source_writer(buffered = False) as writer:
            for fields in source_units.iterator():
                Unit.objects.add_to_source_index(
                    fields['checksum'],
                    fields['source'],
                    fields['context'],
                    fields['translation_id'],
                    writer)

        # Index translated strings, one index per language.
        for language in all_languages:
            target_units = Unit.objects.filter(
                translation__language = language).exclude(
                target = '').values('checksum', 'target', 'translation_id')
            with trans.search.get_target_writer(lang = language.code, buffered = False) as writer:
                for fields in target_units.iterator():
                    Unit.objects.add_to_target_index(
                        fields['checksum'],
                        fields['target'],
                        fields['translation_id'],
                        writer)
from django.core.management.base import BaseCommand, CommandError
from trans.models import Unit
from ftsearch.models import WordLocation, Word
from optparse import make_option
class Command(BaseCommand):
......@@ -17,7 +16,7 @@ class Command(BaseCommand):
def handle(self, *args, **options):
if options['all']:
for unit in Unit.objects.all():
for unit in Unit.objects.all().iterator():
unit.check()
for arg in args:
parts = arg.split('/')
......
......@@ -3,8 +3,12 @@ from django.conf import settings
from lang.models import Language
from whoosh import qparser
from util import is_plural, split_plural, join_plural, msg_checksum
import trans.search
IGNORE_WORDS = set([
'a',
'an',
......@@ -138,107 +142,72 @@ class UnitManager(models.Manager):
else:
return self.all()
def is_indexed(self, unit):
    """Tell whether any word locations are stored for the given unit."""
    from ftsearch.models import WordLocation
    locations = WordLocation.objects.filter(unit = unit)
    return locations.exists()
def remove_from_index(self, unit):
    """Drop every stored word location for the given unit."""
    from ftsearch.models import WordLocation
    locations = WordLocation.objects.filter(unit = unit)
    return locations.delete()
def separate_words(self, words):
    """Split text into word tokens using the configured regex."""
    splitter = settings.SEARCH_WORD_SPLIT_REGEX
    return splitter.split(words)
def get_similar_list(self, words):
    """Lowercased words usable for similarity search (empties and stop words removed)."""
    result = []
    for token in self.separate_words(words):
        token = token.lower()
        if len(token) > 0 and token not in IGNORE_SIMILAR:
            result.append(token)
    return result
def __index_item(self, text, language, unit):
    """Stem the given text and persist each word occurrence for the unit.

    language may be None (used for context strings).
    """
    from ftsearch.models import WordLocation, Word
    stemmer = settings.SEARCH_STEMMER()
    # Split, lowercase and stem, skipping empty fragments.
    stemmed_text = []
    for fragment in self.separate_words(text):
        if fragment == '':
            continue
        stemmed_text.append(stemmer.stem(fragment.lower()))
    # Persist each non-stop word with its position.
    for position, word in enumerate(stemmed_text):
        if word in IGNORE_WORDS:
            continue
        wordobj, created = Word.objects.get_or_create(
            word = word,
            language = language
        )
        WordLocation.objects.create(
            unit = unit,
            word = wordobj,
            location = position
        )
def add_to_index(self, unit):
    """(Re)index source, translation and context of a unit."""
    from ftsearch.models import WordLocation
    # Drop stale entries before reindexing.
    if self.is_indexed(unit):
        self.remove_from_index(unit)
    # Source strings are always English.
    self.__index_item(
        '\n'.join(unit.get_source_plurals()),
        Language.objects.get(code = 'en'),
        unit)
    # Translation carries the unit's target language.
    self.__index_item(
        '\n'.join(unit.get_target_plurals()),
        unit.translation.language,
        unit)
    # Context is language neutral.
    if unit.context != '':
        self.__index_item(unit.context, None, unit)
def __get_match_rows(self, query, language):
    """Return unit ids containing ALL stemmed words of query in language.

    Builds a raw SQL self-join over ftsearch_wordlocation with one aliased
    table per matched word, so only units containing every word match.
    """
    from ftsearch.models import Word
    # Grab relevant words
    word_objects = Word.objects.filter(word__in = query, language = language)
    field_list = 'w0.unit_id'
    table_list = ''
    clause_list = ''
    table_number = 0
    for word in word_objects:
        if table_number > 0:
            # Join each new alias to the previous one on unit_id.
            table_list += ', '
            clause_list += ' and w%d.unit_id = w%d.unit_id and ' \
                % (table_number - 1, table_number)
        table_list += 'ftsearch_wordlocation w%d' % table_number
        clause_list += 'w%d.word_id=%d' % (table_number, word.id)
        table_number += 1
    # No known word matched the query -> nothing to search for.
    if not table_list or not clause_list:
        return []
    # NOTE(review): SQL is assembled by string interpolation, but the only
    # interpolated values are integer primary keys from the ORM, so no user
    # input reaches the query text.
    cur = connection.cursor()
    cur.execute('select %s from %s where %s' \
        % (field_list, table_list, clause_list))
    rows = cur.fetchall()
    return [row[0] for row in rows]
def search(self, query, language):
    """Fulltext search; returns a queryset of units matching all query words."""
    from trans.models import Unit
    if isinstance(query, (str, unicode)):
        # split the string into a list of search terms
        query = self.separate_words(query)
    elif not isinstance(query, (list, tuple)):
        raise TypeError("search must be called with a string or a list")
    stemmer = settings.SEARCH_STEMMER()
    # lowercase and stem each word
    stemmed_query = [stemmer.stem(term.lower()) for term in query if term != '']
    # get a row from the db for each matching word
    rows = self.__get_match_rows(stemmed_query, language)
    if not rows:
        return self.none()
    return self.filter(pk__in = rows)
def add_to_source_index(self, checksum, source, context, translation, writer):
    """Store (or replace, keyed by checksum) a source document in the Whoosh index."""
    document = {
        'checksum': checksum,
        'source': source,
        'context': context,
        'translation': translation,
    }
    writer.update_document(**document)
def add_to_target_index(self, checksum, target, translation, writer):
    """Store (or replace, keyed by checksum) a target document in the Whoosh index."""
    document = {
        'checksum': checksum,
        'target': target,
        'translation': translation,
    }
    writer.update_document(**document)
def add_to_index(self, unit, writer_target = None, writer_source = None):
    """Add one unit to both the source and the target Whoosh indices.

    Writers may be passed in for batch indexing; otherwise the shared
    writers are fetched lazily.

    Bug fix: the target writer must be chosen by the translation language
    (unit.translation.language.code). The old code read
    unit.target.language.code, but target is the translated text itself
    and has no language attribute, so lazy writer lookup always crashed.
    """
    if writer_target is None:
        writer_target = trans.search.get_target_writer(unit.translation.language.code)
    if writer_source is None:
        writer_source = trans.search.get_source_writer()
    self.add_to_source_index(
        unit.checksum,
        unit.source,
        unit.context,
        unit.translation_id,
        writer_source)
    self.add_to_target_index(
        unit.checksum,
        unit.target,
        unit.translation_id,
        writer_target)
def search(self, query, source = True, context = True, translation = True):
    """Search the Whoosh indices; returns a queryset of matching units.

    Flags select which fields are searched. Fixes: the old code fetched
    self.all()[0] unconditionally, which ran a needless query when
    translation=False and raised IndexError on an empty queryset; the
    sample is now only fetched when needed and an empty queryset simply
    skips the target search.
    """
    ret = []
    if source or context:
        with trans.search.get_source_searcher() as searcher:
            if source:
                qp = qparser.QueryParser('source', trans.search.SourceSchema())
                q = qp.parse(query)
                for doc in searcher.docs_for_query(q):
                    ret.append(searcher.stored_fields(doc)['checksum'])
            if context:
                qp = qparser.QueryParser('context', trans.search.SourceSchema())
                q = qp.parse(query)
                for doc in searcher.docs_for_query(q):
                    ret.append(searcher.stored_fields(doc)['checksum'])
    if translation:
        # The target index is per language; use any unit of this queryset
        # to determine which language index to search.
        samples = list(self.all()[:1])
        if samples:
            lang_code = samples[0].translation.language.code
            with trans.search.get_target_searcher(lang_code) as searcher:
                qp = qparser.QueryParser('target', trans.search.TargetSchema())
                q = qp.parse(query)
                for doc in searcher.docs_for_query(q):
                    ret.append(searcher.stored_fields(doc)['checksum'])
    return self.filter(checksum__in = ret)
def similar(self, unit):
    """Units with similar source text in the same project and language."""
    checksums = []
    with trans.search.get_source_searcher() as searcher:
        docnum = searcher.document_number(checksum = unit.checksum)
        for match in searcher.more_like(docnum, 'source', unit.source):
            checksums.append(match['checksum'])
    result = self.filter(
        translation__subproject__project = unit.translation.subproject.project,
        translation__language = unit.translation.language,
        checksum__in = checksums)
    return result.exclude(id = unit.id)
'''
Whoosh based full text search.
'''
import whoosh
import os
from whoosh.fields import SchemaClass, TEXT, ID, NUMERIC
from django.db.models.signals import post_syncdb
from django.conf import settings
from whoosh import index
from whoosh.writing import BufferedWriter
class TargetSchema(SchemaClass):
    """Whoosh schema for the per-language target (translation) indices."""
    # checksum identifies the unit; stored so results can map back to units
    checksum = ID(stored = True, unique = True)
    # translated text, the searchable field
    target = TEXT
    # id of the owning translation
    translation = NUMERIC
class SourceSchema(SchemaClass):
    """Whoosh schema for the shared source-string index."""
    # checksum identifies the unit; stored so results can map back to units
    checksum = ID(stored = True, unique = True)
    # source text, searchable
    source = TEXT
    # context string, searchable
    context = TEXT
    # id of the owning translation
    translation = NUMERIC
def create_source_index():
    """Create (or reset) the shared source index inside WHOOSH_INDEX."""
    return index.create_in(
        settings.WHOOSH_INDEX,
        indexname = 'source',
        schema = SourceSchema,
    )
def create_target_index(lang):
    """Create (or reset) the target index for the given language code."""
    return index.create_in(
        settings.WHOOSH_INDEX,
        indexname = 'target-%s' % lang,
        schema = TargetSchema,
    )
def create_index(sender=None, **kwargs):
    """post_syncdb handler ensuring the index directory and source index exist."""
    if os.path.exists(settings.WHOOSH_INDEX):
        # Directory already there - nothing to set up.
        return
    os.mkdir(settings.WHOOSH_INDEX)
    create_source_index()


post_syncdb.connect(create_index)
def get_source_index():
    """Return the shared source index, opening it lazily on first use."""
    try:
        return get_source_index.ix_source
    except AttributeError:
        pass
    get_source_index.ix_source = index.open_dir(
        settings.WHOOSH_INDEX,
        indexname = 'source'
    )
    return get_source_index.ix_source
def get_target_index(lang):
    """Return the target index for lang, opening (or creating) it lazily."""
    cache = getattr(get_target_index, 'ix_target', None)
    if cache is None:
        cache = {}
        get_target_index.ix_target = cache
    if lang not in cache:
        try:
            cache[lang] = index.open_dir(
                settings.WHOOSH_INDEX,
                indexname = 'target-%s' % lang
            )
        except whoosh.index.EmptyIndexError:
            # No index for this language yet - create an empty one.
            cache[lang] = create_target_index(lang)
    return cache[lang]
def get_source_writer(buffered = True):
    """Writer for the source index; the buffered writer is shared per process."""
    if not buffered:
        # Plain writer: caller is responsible for committing/closing.
        return get_source_index().writer()
    writer = getattr(get_source_writer, 'source_writer', None)
    if writer is None:
        writer = BufferedWriter(get_source_index())
        get_source_writer.source_writer = writer
    return writer
def get_target_writer(lang, buffered = True):
    """Writer for the target index of lang; buffered writers are shared per process."""
    if not buffered:
        # Plain writer: caller is responsible for committing/closing.
        return get_target_index(lang).writer()
    cache = getattr(get_target_writer, 'target_writer', None)
    if cache is None:
        cache = {}
        get_target_writer.target_writer = cache
    if lang not in cache:
        cache[lang] = BufferedWriter(get_target_index(lang))
    return cache[lang]
def get_source_searcher():
    """Searcher over the shared (buffered) source writer."""
    writer = get_source_writer()
    return writer.searcher()
def get_target_searcher(lang):
    """Searcher over the shared (buffered) target writer for lang."""
    writer = get_target_writer(lang)
    return writer.searcher()
......@@ -302,13 +302,7 @@ def translate(request, project, subproject, lang):
query |= Q(context = search_query)
units = units.filter(query)
else:
units = obj.unit_set.none()
if search_source:
units |= obj.unit_set.search(search_query, Language.objects.get(code = 'en'))
if search_target:
units |= obj.unit_set.search(search_query, obj.language)
if search_context:
units |= obj.unit_set.search(search_query, None)
units = obj.unit_set.search(search_query, search_source, search_context, search_target)
if direction == 'stay':
units = units.filter(position = pos)
elif direction == 'back':
......@@ -388,27 +382,8 @@ def get_string(request, checksum):
def get_similar(request, unit_id):
unit = get_object_or_404(Unit, pk = int(unit_id))
words = Unit.objects.get_similar_list(unit.get_source_plurals()[0])
similar = Unit.objects.none()
cnt = min(len(words), 5)
# Try to find 10 similar string, remove up to 5 words
while similar.count() < 10 and cnt > 0 and len(words) - cnt < 5:
for search in itertools.combinations(words, cnt):
similar |= Unit.objects.search(search, Language.objects.get(code = 'en')).filter(
translation__subproject__project = unit.translation.subproject.project,
translation__language = unit.translation.language).exclude(id = unit.id)
cnt -= 1
# distinct('target') works with Django 1.4 so let's emulate that
# based on presumption we won't get too many results
targets = {}
res = []
for s in similar:
if s.target in targets:
continue
targets[s.target] = 1
res.append(s)
similar = res
similar = Unit.objects.similar(unit)
return render_to_response('similar.html', RequestContext(request, {
'similar': similar,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment