Commit d3f63cc5 authored by Michal Čihař's avatar Michal Čihař

Merge remote-tracking branch 'origin/master'

parents 080093c8 455aeed5
......@@ -2,3 +2,4 @@
*.swp
repos/
*.mo
whoosh-index/
"""
Django Full-text search
Author: Patrick Carroll <patrick@patrickomatic.com>
Version: 0.1
"""
import re
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from ftsearch.stemming import PorterStemmer
try:
getattr(settings, 'SEARCH_STEMMER')
# make sure it has a callable .stem() method
try:
settings.SEARCH_STEMMER().stem('foo')
except AttributeError:
raise ImproperlyConfigured("The supplied stemmer must support a stem() method")
except AttributeError:
settings.SEARCH_STEMMER = PorterStemmer
try:
getattr(settings, 'SEARCH_WORD_SPLIT_REGEX')
except AttributeError:
settings.SEARCH_WORD_SPLIT_REGEX = re.compile(r'\W*')
from django.core.management.base import BaseCommand, CommandError
from trans.models import Unit
from ftsearch.models import WordLocation, Word
from optparse import make_option
class Command(BaseCommand):
    """Management command rebuilding the ftsearch fulltext index."""
    help = 'updates index for fulltext search'
    option_list = BaseCommand.option_list + (
        make_option('--clean',
            action='store_true',
            dest='clean',
            default=False,
            help='removes also all words from database'),
        )

    def handle(self, *args, **options):
        """Index every unit, optionally wiping the word tables first."""
        if options['clean']:
            # Deleting Word cascades into WordLocation, but remove both
            # explicitly to keep the cleanup obvious.
            Word.objects.all().delete()
            WordLocation.objects.all().delete()
        # iterator() streams rows instead of caching the whole Unit table
        # in memory (consistent with the other indexing commands).
        for unit in Unit.objects.all().iterator():
            Unit.objects.add_to_index(unit)
"""
Code based on Django Full-text search
"""
from django.db import models
from lang.models import Language
from trans.models import Unit
class Word(models.Model):
    """One indexed word, optionally bound to a language.

    language is nullable: context strings are indexed with language=None
    (see UnitManager.add_to_index).
    """
    word = models.CharField(max_length=255)
    language = models.ForeignKey(Language, null = True, blank = True)

    def __unicode__(self):
        # Bug fix: language may be None (null=True), in which case the old
        # code raised AttributeError on self.language.name.
        if self.language is None:
            return self.word
        return "%s: %s" % (self.language.name, self.word)

    class Meta:
        unique_together = ('word', 'language')
class WordLocation(models.Model):
    """Occurrence of a word at a given position within a unit's text."""
    # the indexed word
    word = models.ForeignKey(Word)
    # position of the word within the indexed text (see __index_item)
    location = models.PositiveIntegerField()
    # the translation unit the word occurs in
    unit = models.ForeignKey(Unit)

    def __unicode__(self):
        return "%s[%d] (%d)" % (self.word, self.location, self.unit.id)
This diff is collapsed.
......@@ -140,7 +140,6 @@ INSTALLED_APPS = (
'trans',
'lang',
'accounts',
'ftsearch',
'weblate',
)
......@@ -252,3 +251,6 @@ ENABLE_HOOKS = True
# Number of nearby messages to show in each direction
NEARBY_MESSAGES = 5
# Where to put Whoosh index
WHOOSH_INDEX = os.path.join(WEB_ROOT, 'whoosh-index')
from django.core.management.base import BaseCommand, CommandError
from trans.models import Unit
from lang.models import Language
import trans.search
from optparse import make_option
class Command(BaseCommand):
    """Rebuild the Whoosh fulltext indices for all units."""
    help = 'updates index for fulltext search'
    option_list = BaseCommand.option_list + (
        make_option('--clean',
            action='store_true',
            dest='clean',
            default=False,
            help='removes also all words from database'),
        )

    def handle(self, *args, **options):
        all_languages = Language.objects.all()

        # Optionally recreate every index from scratch.
        if options['clean']:
            trans.search.create_source_index()
            for language in all_languages:
                trans.search.create_target_index(lang = language.code)

        # Index source strings (shared across all languages).
        source_units = Unit.objects.values(
            'checksum', 'source', 'context', 'translation_id').distinct()
        with trans.search.get_source_writer(buffered = False) as writer:
            for fields in source_units.iterator():
                Unit.objects.add_to_source_index(
                    fields['checksum'],
                    fields['source'],
                    fields['context'],
                    fields['translation_id'],
                    writer)

        # Index translated strings, one index per language.
        for language in all_languages:
            target_units = Unit.objects.filter(
                translation__language = language).exclude(
                target = '').values('checksum', 'target', 'translation_id')
            with trans.search.get_target_writer(lang = language.code, buffered = False) as writer:
                for fields in target_units.iterator():
                    Unit.objects.add_to_target_index(
                        fields['checksum'],
                        fields['target'],
                        fields['translation_id'],
                        writer)
from django.core.management.base import BaseCommand, CommandError
from trans.models import Unit
from ftsearch.models import WordLocation, Word
from optparse import make_option
class Command(BaseCommand):
......@@ -17,7 +16,7 @@ class Command(BaseCommand):
def handle(self, *args, **options):
if options['all']:
for unit in Unit.objects.all():
for unit in Unit.objects.all().iterator():
unit.check()
for arg in args:
parts = arg.split('/')
......
......@@ -3,8 +3,12 @@ from django.conf import settings
from lang.models import Language
from whoosh import qparser
from util import is_plural, split_plural, join_plural, msg_checksum
import trans.search
IGNORE_WORDS = set([
'a',
'an',
......@@ -138,107 +142,72 @@ class UnitManager(models.Manager):
else:
return self.all()
def is_indexed(self, unit):
    """Tell whether any word locations are stored for the given unit."""
    from ftsearch.models import WordLocation
    locations = WordLocation.objects.filter(unit = unit)
    return locations.exists()
def remove_from_index(self, unit):
    """Drop every stored word location for the given unit."""
    from ftsearch.models import WordLocation
    locations = WordLocation.objects.filter(unit = unit)
    return locations.delete()
def separate_words(self, words):
    """Split text into word tokens using the configured regex."""
    splitter = settings.SEARCH_WORD_SPLIT_REGEX
    return splitter.split(words)
def get_similar_list(self, words):
    """Lowercased words usable for similarity search (empties and stop words removed)."""
    result = []
    for token in self.separate_words(words):
        token = token.lower()
        if len(token) > 0 and token not in IGNORE_SIMILAR:
            result.append(token)
    return result
def __index_item(self, text, language, unit):
    """Stem the given text and persist each word occurrence for the unit.

    language may be None (used for context strings).
    """
    from ftsearch.models import WordLocation, Word
    stemmer = settings.SEARCH_STEMMER()
    # Split, lowercase and stem, skipping empty fragments.
    stemmed_text = []
    for fragment in self.separate_words(text):
        if fragment == '':
            continue
        stemmed_text.append(stemmer.stem(fragment.lower()))
    # Persist each non-stop word with its position.
    for position, word in enumerate(stemmed_text):
        if word in IGNORE_WORDS:
            continue
        wordobj, created = Word.objects.get_or_create(
            word = word,
            language = language
        )
        WordLocation.objects.create(
            unit = unit,
            word = wordobj,
            location = position
        )
def add_to_index(self, unit):
    """(Re)index source, translation and context of a unit."""
    from ftsearch.models import WordLocation
    # Drop stale entries before reindexing.
    if self.is_indexed(unit):
        self.remove_from_index(unit)
    # Source strings are always English.
    self.__index_item(
        '\n'.join(unit.get_source_plurals()),
        Language.objects.get(code = 'en'),
        unit)
    # Translation carries the unit's target language.
    self.__index_item(
        '\n'.join(unit.get_target_plurals()),
        unit.translation.language,
        unit)
    # Context is language neutral.
    if unit.context != '':
        self.__index_item(unit.context, None, unit)
def __get_match_rows(self, query, language):
    """Return unit ids containing ALL stemmed words of query in language.

    Builds a raw SQL self-join over ftsearch_wordlocation with one aliased
    table per matched word, so only units containing every word match.
    """
    from ftsearch.models import Word
    # Grab relevant words
    word_objects = Word.objects.filter(word__in = query, language = language)
    field_list = 'w0.unit_id'
    table_list = ''
    clause_list = ''
    table_number = 0
    for word in word_objects:
        if table_number > 0:
            # Join each new alias to the previous one on unit_id.
            table_list += ', '
            clause_list += ' and w%d.unit_id = w%d.unit_id and ' \
                % (table_number - 1, table_number)
        table_list += 'ftsearch_wordlocation w%d' % table_number
        clause_list += 'w%d.word_id=%d' % (table_number, word.id)
        table_number += 1
    # No known word matched the query -> nothing to search for.
    if not table_list or not clause_list:
        return []
    # NOTE(review): SQL is assembled by string interpolation, but the only
    # interpolated values are integer primary keys from the ORM, so no user
    # input reaches the query text.
    cur = connection.cursor()
    cur.execute('select %s from %s where %s' \
        % (field_list, table_list, clause_list))
    rows = cur.fetchall()
    return [row[0] for row in rows]
def search(self, query, language):
    """Fulltext search; returns a queryset of units matching all query words."""
    from trans.models import Unit
    if isinstance(query, (str, unicode)):
        # split the string into a list of search terms
        query = self.separate_words(query)
    elif not isinstance(query, (list, tuple)):
        raise TypeError("search must be called with a string or a list")
    stemmer = settings.SEARCH_STEMMER()
    # lowercase and stem each word
    stemmed_query = [stemmer.stem(term.lower()) for term in query if term != '']
    # get a row from the db for each matching word
    rows = self.__get_match_rows(stemmed_query, language)
    if not rows:
        return self.none()
    return self.filter(pk__in = rows)
def add_to_source_index(self, checksum, source, context, translation, writer):
    """Store (or replace, keyed by checksum) a source document in the Whoosh index."""
    document = {
        'checksum': checksum,
        'source': source,
        'context': context,
        'translation': translation,
    }
    writer.update_document(**document)
def add_to_target_index(self, checksum, target, translation, writer):
    """Store (or replace, keyed by checksum) a target document in the Whoosh index."""
    document = {
        'checksum': checksum,
        'target': target,
        'translation': translation,
    }
    writer.update_document(**document)
def add_to_index(self, unit, writer_target = None, writer_source = None):
    """Add one unit to both the source and the target Whoosh indices.

    Writers may be passed in for batch indexing; otherwise the shared
    writers are fetched lazily.

    Bug fix: the target writer must be chosen by the translation language
    (unit.translation.language.code). The old code read
    unit.target.language.code, but target is the translated text itself
    and has no language attribute, so lazy writer lookup always crashed.
    """
    if writer_target is None:
        writer_target = trans.search.get_target_writer(unit.translation.language.code)
    if writer_source is None:
        writer_source = trans.search.get_source_writer()
    self.add_to_source_index(
        unit.checksum,
        unit.source,
        unit.context,
        unit.translation_id,
        writer_source)
    self.add_to_target_index(
        unit.checksum,
        unit.target,
        unit.translation_id,
        writer_target)
def search(self, query, source = True, context = True, translation = True):
    """Search the Whoosh indices; returns a queryset of matching units.

    Flags select which fields are searched. Fixes: the old code fetched
    self.all()[0] unconditionally, which ran a needless query when
    translation=False and raised IndexError on an empty queryset; the
    sample is now only fetched when needed and an empty queryset simply
    skips the target search.
    """
    ret = []
    if source or context:
        with trans.search.get_source_searcher() as searcher:
            if source:
                qp = qparser.QueryParser('source', trans.search.SourceSchema())
                q = qp.parse(query)
                for doc in searcher.docs_for_query(q):
                    ret.append(searcher.stored_fields(doc)['checksum'])
            if context:
                qp = qparser.QueryParser('context', trans.search.SourceSchema())
                q = qp.parse(query)
                for doc in searcher.docs_for_query(q):
                    ret.append(searcher.stored_fields(doc)['checksum'])
    if translation:
        # The target index is per language; use any unit of this queryset
        # to determine which language index to search.
        samples = list(self.all()[:1])
        if samples:
            lang_code = samples[0].translation.language.code
            with trans.search.get_target_searcher(lang_code) as searcher:
                qp = qparser.QueryParser('target', trans.search.TargetSchema())
                q = qp.parse(query)
                for doc in searcher.docs_for_query(q):
                    ret.append(searcher.stored_fields(doc)['checksum'])
    return self.filter(checksum__in = ret)
def similar(self, unit):
    """Units with similar source text in the same project and language."""
    checksums = []
    with trans.search.get_source_searcher() as searcher:
        docnum = searcher.document_number(checksum = unit.checksum)
        for match in searcher.more_like(docnum, 'source', unit.source):
            checksums.append(match['checksum'])
    result = self.filter(
        translation__subproject__project = unit.translation.subproject.project,
        translation__language = unit.translation.language,
        checksum__in = checksums)
    return result.exclude(id = unit.id)
'''
Whoosh based full text search.
'''
import whoosh
import os
from whoosh.fields import SchemaClass, TEXT, ID, NUMERIC
from django.db.models.signals import post_syncdb
from django.conf import settings
from whoosh import index
from whoosh.writing import BufferedWriter
class TargetSchema(SchemaClass):
    """Whoosh schema for the per-language target (translation) indices."""
    # checksum identifies the unit; stored so results can map back to units
    checksum = ID(stored = True, unique = True)
    # translated text, the searchable field
    target = TEXT
    # id of the owning translation
    translation = NUMERIC
class SourceSchema(SchemaClass):
    """Whoosh schema for the shared source-string index."""
    # checksum identifies the unit; stored so results can map back to units
    checksum = ID(stored = True, unique = True)
    # source text, searchable
    source = TEXT
    # context string, searchable
    context = TEXT
    # id of the owning translation
    translation = NUMERIC
def create_source_index():
    """Create (or reset) the shared source index inside WHOOSH_INDEX."""
    return index.create_in(
        settings.WHOOSH_INDEX,
        indexname = 'source',
        schema = SourceSchema,
    )
def create_target_index(lang):
    """Create (or reset) the target index for the given language code."""
    return index.create_in(
        settings.WHOOSH_INDEX,
        indexname = 'target-%s' % lang,
        schema = TargetSchema,
    )
def create_index(sender=None, **kwargs):
    """post_syncdb handler ensuring the index directory and source index exist."""
    if os.path.exists(settings.WHOOSH_INDEX):
        # Directory already there - nothing to set up.
        return
    os.mkdir(settings.WHOOSH_INDEX)
    create_source_index()


post_syncdb.connect(create_index)
def get_source_index():
    """Return the shared source index, opening it lazily on first use."""
    try:
        return get_source_index.ix_source
    except AttributeError:
        pass
    get_source_index.ix_source = index.open_dir(
        settings.WHOOSH_INDEX,
        indexname = 'source'
    )
    return get_source_index.ix_source
def get_target_index(lang):
    """Return the target index for lang, opening (or creating) it lazily."""
    cache = getattr(get_target_index, 'ix_target', None)
    if cache is None:
        cache = {}
        get_target_index.ix_target = cache
    if lang not in cache:
        try:
            cache[lang] = index.open_dir(
                settings.WHOOSH_INDEX,
                indexname = 'target-%s' % lang
            )
        except whoosh.index.EmptyIndexError:
            # No index for this language yet - create an empty one.
            cache[lang] = create_target_index(lang)
    return cache[lang]
def get_source_writer(buffered = True):
    """Writer for the source index; the buffered writer is shared per process."""
    if not buffered:
        # Plain writer: caller is responsible for committing/closing.
        return get_source_index().writer()
    writer = getattr(get_source_writer, 'source_writer', None)
    if writer is None:
        writer = BufferedWriter(get_source_index())
        get_source_writer.source_writer = writer
    return writer
def get_target_writer(lang, buffered = True):
    """Writer for the target index of lang; buffered writers are shared per process."""
    if not buffered:
        # Plain writer: caller is responsible for committing/closing.
        return get_target_index(lang).writer()
    cache = getattr(get_target_writer, 'target_writer', None)
    if cache is None:
        cache = {}
        get_target_writer.target_writer = cache
    if lang not in cache:
        cache[lang] = BufferedWriter(get_target_index(lang))
    return cache[lang]
def get_source_searcher():
    """Searcher over the shared (buffered) source writer."""
    writer = get_source_writer()
    return writer.searcher()
def get_target_searcher(lang):
    """Searcher over the shared (buffered) target writer for lang."""
    writer = get_target_writer(lang)
    return writer.searcher()
......@@ -302,13 +302,7 @@ def translate(request, project, subproject, lang):
query |= Q(context = search_query)
units = units.filter(query)
else:
units = obj.unit_set.none()
if search_source:
units |= obj.unit_set.search(search_query, Language.objects.get(code = 'en'))
if search_target:
units |= obj.unit_set.search(search_query, obj.language)
if search_context:
units |= obj.unit_set.search(search_query, None)
units = obj.unit_set.search(search_query, search_source, search_context, search_target)
if direction == 'stay':
units = units.filter(position = pos)
elif direction == 'back':
......@@ -388,27 +382,8 @@ def get_string(request, checksum):
def get_similar(request, unit_id):
unit = get_object_or_404(Unit, pk = int(unit_id))
words = Unit.objects.get_similar_list(unit.get_source_plurals()[0])
similar = Unit.objects.none()
cnt = min(len(words), 5)
# Try to find 10 similar string, remove up to 5 words
while similar.count() < 10 and cnt > 0 and len(words) - cnt < 5:
for search in itertools.combinations(words, cnt):
similar |= Unit.objects.search(search, Language.objects.get(code = 'en')).filter(
translation__subproject__project = unit.translation.subproject.project,
translation__language = unit.translation.language).exclude(id = unit.id)
cnt -= 1
# distinct('target') works with Django 1.4 so let's emulate that
# based on presumption we won't get too many results
targets = {}
res = []
for s in similar:
if s.target in targets:
continue
targets[s.target] = 1
res.append(s)
similar = res
similar = Unit.objects.similar(unit)
return render_to_response('similar.html', RequestContext(request, {
'similar': similar,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment