Commit d3f63cc5 authored by Michal Čihař

Merge remote-tracking branch 'origin/master'

parents 080093c8 455aeed5
@@ -2,3 +2,4 @@
*.swp
repos/
*.mo
whoosh-index/
"""
Django Full-text search
Author: Patrick Carroll <patrick@patrickomatic.com>
Version: 0.1
"""
import re
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from ftsearch.stemming import PorterStemmer
try:
    # raises AttributeError when the setting is absent
    getattr(settings, 'SEARCH_STEMMER')
# make sure it has a callable .stem() method
try:
settings.SEARCH_STEMMER().stem('foo')
except AttributeError:
raise ImproperlyConfigured("The supplied stemmer must support a stem() method")
except AttributeError:
settings.SEARCH_STEMMER = PorterStemmer
try:
getattr(settings, 'SEARCH_WORD_SPLIT_REGEX')
except AttributeError:
settings.SEARCH_WORD_SPLIT_REGEX = re.compile(r'\W*')
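The check above only requires the configured class to expose a callable stem() method, so any drop-in stemmer works. A minimal sketch of wiring a custom one in through settings.py (IdentityStemmer is a hypothetical name, not part of this commit):

# in settings.py -- disable stemming while keeping the search pipeline intact
class IdentityStemmer(object):
    def stem(self, word):
        # no-op: index and search on the raw lower-cased word
        return word

SEARCH_STEMMER = IdentityStemmer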
from django.core.management.base import BaseCommand, CommandError
from trans.models import Unit
from ftsearch.models import WordLocation, Word
from optparse import make_option
class Command(BaseCommand):
help = 'updates index for fulltext search'
option_list = BaseCommand.option_list + (
make_option('--clean',
action='store_true',
dest='clean',
default=False,
            help='also removes all words from the database'),
)
def handle(self, *args, **options):
if options['clean']:
Word.objects.all().delete()
WordLocation.objects.all().delete()
units = Unit.objects.all()
for unit in units:
Unit.objects.add_to_index(unit)
"""
Code based on Django Full-text search
"""
from django.db import models
from lang.models import Language
from trans.models import Unit
class Word(models.Model):
word = models.CharField(max_length=255)
language = models.ForeignKey(Language, null = True, blank = True)
def __unicode__(self):
return "%s: %s" % (self.language.name, self.word)
class Meta:
unique_together = ('word', 'language')
class WordLocation(models.Model):
word = models.ForeignKey(Word)
location = models.PositiveIntegerField()
unit = models.ForeignKey(Unit)
def __unicode__(self):
return "%s[%d] (%d)" % (self.word, self.location, self.unit.id)
"""Porter Stemming Algorithm
This is the Porter stemming algorithm, ported to Python from the
version coded up in ANSI C by the author. It may be regarded
as canonical, in that it follows the algorithm presented in
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,
only differing from it at the points marked --DEPARTURE-- below.
See also http://www.tartarus.org/~martin/PorterStemmer
The algorithm as described in the paper could be exactly replicated
by adjusting the points of DEPARTURE, but this is barely necessary,
because (a) the points of DEPARTURE are definitely improvements, and
(b) no encoding of the Porter stemmer I have seen is anything like
as exact as this version, even with the points of DEPARTURE!
Vivake Gupta (v@nano.com)
Release 1: January 2001
Further adjustments by Santiago Bruno (bananabruno@gmail.com)
to allow word input not restricted to one word per line, leading
to:
release 2: July 2008
"""
class PorterStemmer:
def __init__(self):
"""The main part of the stemming algorithm starts here.
b is a buffer holding a word to be stemmed. The letters are in b[k0],
b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
readjusted downwards as the stemming progresses. Zero termination is
not in fact used in the algorithm.
Note that only lower case sequences are stemmed. Forcing to lower case
should be done before stem(...) is called.
"""
self.b = "" # buffer for word to be stemmed
self.k = 0
self.k0 = 0
self.j = 0 # j is a general offset into the string
def cons(self, i):
"""cons(i) is TRUE <=> b[i] is a consonant."""
if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
return 0
if self.b[i] == 'y':
if i == self.k0:
return 1
else:
return (not self.cons(i - 1))
return 1
def m(self):
"""m() measures the number of consonant sequences between k0 and j.
if c is a consonant sequence and v a vowel sequence, and <..>
indicates arbitrary presence,
<c><v> gives 0
<c>vc<v> gives 1
<c>vcvc<v> gives 2
<c>vcvcvc<v> gives 3
....
"""
n = 0
i = self.k0
while 1:
if i > self.j:
return n
if not self.cons(i):
break
i = i + 1
i = i + 1
while 1:
while 1:
if i > self.j:
return n
if self.cons(i):
break
i = i + 1
i = i + 1
n = n + 1
while 1:
if i > self.j:
return n
if not self.cons(i):
break
i = i + 1
i = i + 1
def vowelinstem(self):
"""vowelinstem() is TRUE <=> k0,...j contains a vowel"""
for i in range(self.k0, self.j + 1):
if not self.cons(i):
return 1
return 0
def doublec(self, j):
"""doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
if j < (self.k0 + 1):
return 0
if (self.b[j] != self.b[j-1]):
return 0
return self.cons(j)
def cvc(self, i):
"""cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
        and also if the second c is not w, x or y. This is used when trying to
        restore an e at the end of a short word, e.g.
cav(e), lov(e), hop(e), crim(e), but
snow, box, tray.
"""
if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
return 0
ch = self.b[i]
if ch == 'w' or ch == 'x' or ch == 'y':
return 0
return 1
def ends(self, s):
"""ends(s) is TRUE <=> k0,...k ends with the string s."""
length = len(s)
if s[length - 1] != self.b[self.k]: # tiny speed-up
return 0
if length > (self.k - self.k0 + 1):
return 0
if self.b[self.k-length+1:self.k+1] != s:
return 0
self.j = self.k - length
return 1
def setto(self, s):
"""setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
length = len(s)
self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
self.k = self.j + length
def r(self, s):
"""r(s) is used further down."""
if self.m() > 0:
self.setto(s)
def step1ab(self):
"""step1ab() gets rid of plurals and -ed or -ing. e.g.
caresses -> caress
ponies -> poni
ties -> ti
caress -> caress
cats -> cat
feed -> feed
agreed -> agree
disabled -> disable
matting -> mat
mating -> mate
meeting -> meet
milling -> mill
messing -> mess
meetings -> meet
"""
if self.b[self.k] == 's':
if self.ends("sses"):
self.k = self.k - 2
elif self.ends("ies"):
self.setto("i")
elif self.b[self.k - 1] != 's':
self.k = self.k - 1
if self.ends("eed"):
if self.m() > 0:
self.k = self.k - 1
elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
self.k = self.j
if self.ends("at"): self.setto("ate")
elif self.ends("bl"): self.setto("ble")
elif self.ends("iz"): self.setto("ize")
elif self.doublec(self.k):
self.k = self.k - 1
ch = self.b[self.k]
if ch == 'l' or ch == 's' or ch == 'z':
self.k = self.k + 1
elif (self.m() == 1 and self.cvc(self.k)):
self.setto("e")
def step1c(self):
"""step1c() turns terminal y to i when there is another vowel in the stem."""
if (self.ends("y") and self.vowelinstem()):
self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
def step2(self):
"""step2() maps double suffices to single ones.
so -ization ( = -ize plus -ation) maps to -ize etc. note that the
string before the suffix must give m() > 0.
"""
if self.b[self.k - 1] == 'a':
if self.ends("ational"): self.r("ate")
elif self.ends("tional"): self.r("tion")
elif self.b[self.k - 1] == 'c':
if self.ends("enci"): self.r("ence")
elif self.ends("anci"): self.r("ance")
elif self.b[self.k - 1] == 'e':
if self.ends("izer"): self.r("ize")
elif self.b[self.k - 1] == 'l':
if self.ends("bli"): self.r("ble") # --DEPARTURE--
# To match the published algorithm, replace this phrase with
# if self.ends("abli"): self.r("able")
elif self.ends("alli"): self.r("al")
elif self.ends("entli"): self.r("ent")
elif self.ends("eli"): self.r("e")
elif self.ends("ousli"): self.r("ous")
elif self.b[self.k - 1] == 'o':
if self.ends("ization"): self.r("ize")
elif self.ends("ation"): self.r("ate")
elif self.ends("ator"): self.r("ate")
elif self.b[self.k - 1] == 's':
if self.ends("alism"): self.r("al")
elif self.ends("iveness"): self.r("ive")
elif self.ends("fulness"): self.r("ful")
elif self.ends("ousness"): self.r("ous")
elif self.b[self.k - 1] == 't':
if self.ends("aliti"): self.r("al")
elif self.ends("iviti"): self.r("ive")
elif self.ends("biliti"): self.r("ble")
elif self.b[self.k - 1] == 'g': # --DEPARTURE--
if self.ends("logi"): self.r("log")
# To match the published algorithm, delete this phrase
def step3(self):
"""step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
if self.b[self.k] == 'e':
if self.ends("icate"): self.r("ic")
elif self.ends("ative"): self.r("")
elif self.ends("alize"): self.r("al")
elif self.b[self.k] == 'i':
if self.ends("iciti"): self.r("ic")
elif self.b[self.k] == 'l':
if self.ends("ical"): self.r("ic")
elif self.ends("ful"): self.r("")
elif self.b[self.k] == 's':
if self.ends("ness"): self.r("")
def step4(self):
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
if self.b[self.k - 1] == 'a':
if self.ends("al"): pass
else: return
elif self.b[self.k - 1] == 'c':
if self.ends("ance"): pass
elif self.ends("ence"): pass
else: return
elif self.b[self.k - 1] == 'e':
if self.ends("er"): pass
else: return
elif self.b[self.k - 1] == 'i':
if self.ends("ic"): pass
else: return
elif self.b[self.k - 1] == 'l':
if self.ends("able"): pass
elif self.ends("ible"): pass
else: return
elif self.b[self.k - 1] == 'n':
if self.ends("ant"): pass
elif self.ends("ement"): pass
elif self.ends("ment"): pass
elif self.ends("ent"): pass
else: return
elif self.b[self.k - 1] == 'o':
if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
elif self.ends("ou"): pass
# takes care of -ous
else: return
elif self.b[self.k - 1] == 's':
if self.ends("ism"): pass
else: return
elif self.b[self.k - 1] == 't':
if self.ends("ate"): pass
elif self.ends("iti"): pass
else: return
elif self.b[self.k - 1] == 'u':
if self.ends("ous"): pass
else: return
elif self.b[self.k - 1] == 'v':
if self.ends("ive"): pass
else: return
elif self.b[self.k - 1] == 'z':
if self.ends("ize"): pass
else: return
else:
return
if self.m() > 1:
self.k = self.j
def step5(self):
"""step5() removes a final -e if m() > 1, and changes -ll to -l if
m() > 1.
"""
self.j = self.k
if self.b[self.k] == 'e':
a = self.m()
if a > 1 or (a == 1 and not self.cvc(self.k-1)):
self.k = self.k - 1
if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
            self.k = self.k - 1
def stem(self, p, i=None, j=None):
"""In stem(p,i,j), p is a char pointer, and the string to be stemmed
is from p[i] to p[j] inclusive. Typically i is zero and j is the
offset to the last character of a string, (p[j+1] == '\0'). The
stemmer adjusts the characters p[i] ... p[j] and returns the new
end-point of the string, k. Stemming never increases word length, so
i <= k <= j. To turn the stemmer into a module, declare 'stem' as
extern, and delete the remainder of this file.
"""
# copy the parameters into statics
self.b = p
        if i is not None: self.k0 = i
        else: self.k0 = 0
        if j is not None: self.k = j
        else: self.k = len(p) - 1
if self.k <= self.k0 + 1:
return self.b # --DEPARTURE--
# With this line, strings of length 1 or 2 don't go through the
# stemming process, although no mention is made of this in the
# published algorithm. Remove the line to match the published
# algorithm.
self.step1ab()
self.step1c()
self.step2()
self.step3()
self.step4()
self.step5()
return self.b[self.k0:self.k+1]
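A quick usage sketch (illustrative, not part of this commit); the expected results follow the examples in the step1ab() docstring, and the input must already be lower-cased as the class docstring requires:

from ftsearch.stemming import PorterStemmer

stemmer = PorterStemmer()
print stemmer.stem('meetings')  # -> 'meet'
print stemmer.stem('ponies')    # -> 'poni'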
@@ -140,7 +140,6 @@ INSTALLED_APPS = (
'trans',
'lang',
'accounts',
'ftsearch',
'weblate',
)
@@ -252,3 +251,6 @@ ENABLE_HOOKS = True
# Number of nearby messages to show in each direction
NEARBY_MESSAGES = 5
# Where to put Whoosh index
WHOOSH_INDEX = os.path.join(WEB_ROOT, 'whoosh-index')
from django.core.management.base import BaseCommand, CommandError
from trans.models import Unit
from lang.models import Language
import trans.search
from optparse import make_option
class Command(BaseCommand):
help = 'updates index for fulltext search'
option_list = BaseCommand.option_list + (
make_option('--clean',
action='store_true',
dest='clean',
default=False,
            help='recreates the fulltext index from scratch'),
)
def handle(self, *args, **options):
languages = Language.objects.all()
if options['clean']:
trans.search.create_source_index()
for lang in languages:
trans.search.create_target_index(lang = lang.code)
with trans.search.get_source_writer(buffered = False) as writer:
for unit in Unit.objects.values('checksum', 'source', 'context', 'translation_id').distinct().iterator():
Unit.objects.add_to_source_index(
unit['checksum'],
unit['source'],
unit['context'],
unit['translation_id'],
writer)
for lang in languages:
with trans.search.get_target_writer(lang = lang.code, buffered = False) as writer:
                for unit in Unit.objects.filter(translation__language = lang).exclude(target = '').values('checksum', 'target', 'translation_id').iterator():
Unit.objects.add_to_target_index(
unit['checksum'],
unit['target'],
unit['translation_id'],
writer)
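The command is run through manage.py in the usual way; a sketch, assuming the file is installed as management/commands/update_index.py (the file name, which determines the command name, is not shown in this listing):

# recreate all Whoosh indices from scratch, then re-index every unit
./manage.py update_index --clean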
from django.core.management.base import BaseCommand, CommandError
from trans.models import Unit
from ftsearch.models import WordLocation, Word
from optparse import make_option
class Command(BaseCommand):
@@ -17,7 +16,7 @@ class Command(BaseCommand):
def handle(self, *args, **options):
if options['all']:
for unit in Unit.objects.all():
for unit in Unit.objects.all().iterator():
unit.check()
for arg in args:
parts = arg.split('/')
@@ -3,8 +3,12 @@ from django.conf import settings
from lang.models import Language
from whoosh import qparser
from util import is_plural, split_plural, join_plural, msg_checksum
import trans.search
IGNORE_WORDS = set([
'a',
'an',
@@ -138,107 +142,72 @@ class UnitManager(models.Manager):
else:
return self.all()
def is_indexed(self, unit):
from ftsearch.models import WordLocation
return WordLocation.objects.filter(unit = unit).exists()
def remove_from_index(self, unit):
from ftsearch.models import WordLocation
return WordLocation.objects.filter(unit = unit).delete()
def separate_words(self, words):
return settings.SEARCH_WORD_SPLIT_REGEX.split(words)
def get_similar_list(self, words):
words = [word.lower() for word in self.separate_words(words)]
return [word for word in words if not word in IGNORE_SIMILAR and len(word) > 0]
def __index_item(self, text, language, unit):
from ftsearch.models import WordLocation, Word
# Split to words
p = settings.SEARCH_STEMMER()
stemmed_text = [p.stem(s.lower()) for s in self.separate_words(text) if s != '']
# Store words in database
for i, word in enumerate(stemmed_text):
if word in IGNORE_WORDS:
continue
wordobj, created = Word.objects.get_or_create(
word = word,
language = language
)
WordLocation.objects.create(
unit = unit,
word = wordobj,
location = i
def add_to_source_index(self, checksum, source, context, translation, writer):
writer.update_document(
checksum = checksum,
source = source,
context = context,
translation = translation,
)
def add_to_index(self, unit):
from ftsearch.models import WordLocation
# Remove if it is already indexed
if self.is_indexed(unit):
self.remove_from_index(unit)
# Index source
self.__index_item('\n'.join(unit.get_source_plurals()), Language.objects.get(code = 'en'), unit)
# Index translation
self.__index_item('\n'.join(unit.get_target_plurals()), unit.translation.language, unit)
# Index context
if unit.context != '':
self.__index_item(unit.context, None, unit)
def __get_match_rows(self, query, language):
from ftsearch.models import Word
# Grab relevant words
word_objects = Word.objects.filter(word__in = query, language = language)
field_list = 'w0.unit_id'
table_list = ''
clause_list = ''
table_number = 0
for word in word_objects:
if table_number > 0:
table_list += ', '
clause_list += ' and w%d.unit_id = w%d.unit_id and ' \
% (table_number - 1, table_number)
table_list += 'ftsearch_wordlocation w%d' % table_number
clause_list += 'w%d.word_id=%d' % (table_number, word.id)
table_number += 1
if not table_list or not clause_list:
return []
cur = connection.cursor()
cur.execute('select %s from %s where %s' \
% (field_list, table_list, clause_list))
rows = cur.fetchall()
return [row[0] for row in rows]
def add_to_target_index(self, checksum, target, translation, writer):
writer.update_document(
checksum = checksum,
target = target,
translation = translation,
)
def search(self, query, language):
from trans.models import Unit
if isinstance(query, str) or isinstance(query, unicode):
# split the string into a list of search terms
query = self.separate_words(query)
elif not isinstance(query, list) and not isinstance(query, tuple):
raise TypeError("search must be called with a string or a list")
p = settings.SEARCH_STEMMER()
# lowercase and stem each word
stemmed_query = [p.stem(s.lower()) for s in query if s != '']
# get a row from the db for each matching word
rows = self.__get_match_rows(stemmed_query, language)
if rows == []:
return self.none()
return self.filter(pk__in = rows)
def add_to_index(self, unit, writer_target = None, writer_source = None):
if writer_target is None:
            writer_target = trans.search.get_target_writer(unit.translation.language.code)
if writer_source is None:
writer_source = trans.search.get_source_writer()
self.add_to_source_index(
unit.checksum,
unit.source,
unit.context,
unit.translation_id,
writer_source)
self.add_to_target_index(
unit.checksum,
unit.target,
unit.translation_id,
writer_target)
def search(self, query, source = True, context = True, translation = True):
ret = []
        if source or context:
with trans.search.get_source_searcher() as searcher:
if source:
qp = qparser.QueryParser('source', trans.search.SourceSchema())
q = qp.parse(query)
for doc in searcher.docs_for_query(q):
ret.append(searcher.stored_fields(doc)['checksum'])
if context:
qp = qparser.QueryParser('context', trans.search.SourceSchema())
q = qp.parse(query)
for doc in searcher.docs_for_query(q):
ret.append(searcher.stored_fields(doc)['checksum'])
        if translation:
            # any unit from this queryset determines the target language
            sample = self.all()[0]
            with trans.search.get_target_searcher(sample.translation.language.code) as searcher:
qp = qparser.QueryParser('target', trans.search.TargetSchema())
q = qp.parse(query)
for doc in searcher.docs_for_query(q):
ret.append(searcher.stored_fields(doc)['checksum'])
return self.filter(checksum__in = ret)
def similar(self, unit):
ret = []
with trans.search.get_source_searcher() as searcher:
doc = searcher.document_number(checksum = unit.checksum)
mlt = searcher.more_like(doc, 'source', unit.source)
for m in mlt:
ret.append(m['checksum'])
return self.filter(
translation__subproject__project = unit.translation.subproject.project,
translation__language = unit.translation.language,
checksum__in = ret).exclude(id = unit.id)
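A usage sketch of the reworked manager API (illustrative only; some_unit stands for any existing Unit instance):

# full-text lookup in source and target strings, skipping context
units = Unit.objects.search('window', source = True, context = False, translation = True)

# Whoosh more-like-this lookup, scoped to the unit's project and language
candidates = Unit.objects.similar(some_unit)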
'''
Whoosh-based full-text search.
'''
import whoosh
import os
from whoosh.fields import SchemaClass, TEXT, ID, NUMERIC
from django.db.models.signals import post_syncdb
from django.conf import settings
from whoosh import index
from whoosh.writing import BufferedWriter
class TargetSchema(SchemaClass):
checksum = ID(stored = True, unique = True)
target = TEXT
translation = NUMERIC
class SourceSchema(SchemaClass):
checksum = ID(stored = True, unique = True)
source = TEXT
context = TEXT
translation = NUMERIC
def create_source_index():
return index.create_in(
settings.WHOOSH_INDEX,
schema = SourceSchema,
indexname = 'source'
)
def create_target_index(lang):
return index.create_in(
settings.WHOOSH_INDEX,
schema = TargetSchema,
indexname = 'target-%s' % lang
)
def create_index(sender=None, **kwargs):
if not os.path.exists(settings.WHOOSH_INDEX):
os.mkdir(settings.WHOOSH_INDEX)
create_source_index()
post_syncdb.connect(create_index)
def get_source_index():
if not hasattr(get_source_index, 'ix_source'):
get_source_index.ix_source = index.open_dir(
settings.WHOOSH_INDEX,
indexname = 'source'
)
return get_source_index.ix_source
def get_target_index(lang):
if not hasattr(get_target_index, 'ix_target'):
get_target_index.ix_target = {}
if not lang in get_target_index.ix_target:
try:
get_target_index.ix_target[lang] = index.open_dir(
settings.WHOOSH_INDEX,
indexname = 'target-%s' % lang
)
except whoosh.index.EmptyIndexError:
get_target_index.ix_target[lang] = create_target_index(lang)
return get_target_index.ix_target[lang]
def get_source_writer(buffered = True):
if not buffered:
return get_source_index().writer()
if not hasattr(get_source_writer, 'source_writer'):
get_source_writer.source_writer = BufferedWriter(get_source_index())
return get_source_writer.source_writer
def get_target_writer(lang, buffered = True):
if not buffered:
return get_target_index(lang).writer()
if not hasattr(get_target_writer, 'target_writer'):
get_target_writer.target_writer = {}
if not lang in get_target_writer.target_writer:
get_target_writer.target_writer[lang] = BufferedWriter(get_target_index(lang))
return get_target_writer.target_writer[lang]
def get_source_searcher():
return get_source_writer().searcher()
def get_target_searcher(lang):
return get_target_writer(lang).searcher()
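For completeness, a minimal sketch of querying the source index directly through these helpers (the field and index names come from SourceSchema above; checksum is the stored document key):

from whoosh import qparser
import trans.search

with trans.search.get_source_searcher() as searcher:
    qp = qparser.QueryParser('source', trans.search.SourceSchema())
    for hit in searcher.search(qp.parse(u'hello')):
        print hit['checksum']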
@@ -302,13 +302,7 @@ def translate(request, project, subproject, lang):
query |= Q(context = search_query)
units = units.filter(query)
else:
units = obj.unit_set.none()
if search_source:
units |= obj.unit_set.search(search_query, Language.objects.get(code = 'en'))
if search_target:
units |= obj.unit_set.search(search_query, obj.language)
if search_context:
units |= obj.unit_set.search(search_query, None)
units = obj.unit_set.search(search_query, search_source, search_context, search_target)
if direction == 'stay':
units = units.filter(position = pos)
elif direction == 'back':
@@ -388,27 +382,8 @@ def get_string(request, checksum):
def get_similar(request, unit_id):
unit = get_object_or_404(Unit, pk = int(unit_id))
words = Unit.objects.get_similar_list(unit.get_source_plurals()[0])
similar = Unit.objects.none()
cnt = min(len(words), 5)
    # Try to find 10 similar strings, removing up to 5 words
while similar.count() < 10 and cnt > 0 and len(words) - cnt < 5:
for search in itertools.combinations(words, cnt):
similar |= Unit.objects.search(search, Language.objects.get(code = 'en')).filter(
translation__subproject__project = unit.translation.subproject.project,
translation__language = unit.translation.language).exclude(id = unit.id)
cnt -= 1
    # distinct('target') only works with Django 1.4, so let's emulate it
    # based on the presumption that we won't get too many results
targets = {}
res = []
for s in similar:
if s.target in targets:
continue
targets[s.target] = 1
res.append(s)
similar = res
similar = Unit.objects.similar(unit)
return render_to_response('similar.html', RequestContext(request, {
'similar': similar,