Commit b9a10f9a authored by Michal Čihař

Improved dictionary matching for several languages.

- We use Whoosh per language analyzer if available
- We use ngram analyzer for CJK

Fixes #736
Signed-off-by: Michal Čihař <michal@cihar.com>
parent ace46478
......@@ -24,6 +24,7 @@ Released on ? 2015.
* Support for adding new translations in XLIFF.
* Improved file format autodetection.
* Extended keyboard shortcuts.
* Improved dictionary matching for several languages.
weblate 2.3
-----------
......
......@@ -483,3 +483,10 @@ class Language(models.Model, PercentMixin):
elif self.code == 'pt_BR':
self.nplurals = 2
self.pluralequation = 'n > 1'
def base_code(self):
    """Return the language code without any country/variant suffix.

    Underscores are treated like dashes and everything from the first
    separator onwards is dropped, so ``pt_BR`` and ``pt-BR`` both
    give ``pt``. A code without a separator is returned unchanged.
    """
    return self.code.replace('_', '-').split('-')[0]
def uses_ngram(self):
    """Return whether n-gram matching should be used for this language.

    True when the base language code is Japanese (``ja``), Chinese
    (``zh``) or Korean (``ko``).
    """
    code = self.base_code()
    return code in ('ja', 'zh', 'ko')
......@@ -26,7 +26,10 @@ from weblate.trans.formats import AutoFormat, StringIOMode
from weblate.trans.models.project import Project
from translate.storage.csvl10n import csvfile
from django.core.urlresolvers import reverse
from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer
from whoosh.analysis import (
LanguageAnalyzer, StandardAnalyzer, StemmingAnalyzer, NgramAnalyzer
)
from whoosh.lang import has_stemmer
class DictionaryManager(models.Manager):
......@@ -133,7 +136,17 @@ class DictionaryManager(models.Manager):
# Prepare analyzers
# - standard analyzer simply splits words
# - stemming extracts stems, to catch things like plurals
analyzers = (StandardAnalyzer(), StemmingAnalyzer())
# Mutable list so that optional analyzers can be appended below
analyzers = [
    StandardAnalyzer(),
    StemmingAnalyzer(),
]
lang_code = unit.translation.language.base_code()
# Add per language analyzer if Whoosh has it
if has_stemmer(lang_code):
    analyzers.append(LanguageAnalyzer(lang_code))
# Add ngram analyzer for languages like Chinese or Japanese
if unit.translation.language.uses_ngram():
    # The imported class is NgramAnalyzer (lowercase "g"); the previous
    # spelling NGramAnalyzer raised NameError for these languages.
    # Its minsize argument is mandatory.
    # NOTE(review): a size of 4 is a judgement call - tune if matching
    # turns out too strict or too loose.
    analyzers.append(NgramAnalyzer(4))
# Extract words from all plurals and from context
for text in unit.get_source_plurals() + [unit.context]:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment