Commit b9a10f9a authored by Michal Čihař

Improved dictionary matching for several languages.

- We use Whoosh per language analyzer if available
- We use ngram analyzer for CJK

Fixes #736
Signed-off-by: Michal Čihař <michal@cihar.com>
parent ace46478
...@@ -24,6 +24,7 @@ Released on ? 2015. ...@@ -24,6 +24,7 @@ Released on ? 2015.
* Support for adding new translations in XLIFF. * Support for adding new translations in XLIFF.
* Improved file format autodetection. * Improved file format autodetection.
* Extended keyboard shortcuts. * Extended keyboard shortcuts.
* Improved dictionary matching for several languages.
weblate 2.3 weblate 2.3
----------- -----------
......
...@@ -483,3 +483,10 @@ class Language(models.Model, PercentMixin): ...@@ -483,3 +483,10 @@ class Language(models.Model, PercentMixin):
elif self.code == 'pt_BR': elif self.code == 'pt_BR':
self.nplurals = 2 self.nplurals = 2
self.pluralequation = 'n > 1' self.pluralequation = 'n > 1'
def base_code(self):
    """Return the bare language part of ``self.code``.

    Both ``_`` and ``-`` are treated as the separator in front of a
    country or script suffix, and ``@`` as the separator in front of a
    variant modifier, so ``pt_BR``, ``zh-Hans``, ``sr@latin`` and
    ``sr_RS@latin`` all reduce to their first component.
    """
    normalized = self.code.replace('_', '-')
    # Cut the variant modifier as well; otherwise a code such as
    # ``sr@latin`` would come back unchanged while ``sr_RS@latin``
    # would be reduced to ``sr``.
    return normalized.split('-')[0].split('@')[0]
def uses_ngram(self):
    """Tell whether the base language code is ``ja``, ``zh`` or ``ko``.

    The result is used by callers to decide whether an ngram analyzer
    should be applied for this language.
    """
    return self.base_code() in ('ja', 'zh', 'ko')
...@@ -26,7 +26,10 @@ from weblate.trans.formats import AutoFormat, StringIOMode ...@@ -26,7 +26,10 @@ from weblate.trans.formats import AutoFormat, StringIOMode
from weblate.trans.models.project import Project from weblate.trans.models.project import Project
from translate.storage.csvl10n import csvfile from translate.storage.csvl10n import csvfile
from django.core.urlresolvers import reverse from django.core.urlresolvers import reverse
from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer from whoosh.analysis import (
LanguageAnalyzer, StandardAnalyzer, StemmingAnalyzer, NgramAnalyzer
)
from whoosh.lang import has_stemmer
class DictionaryManager(models.Manager): class DictionaryManager(models.Manager):
...@@ -133,7 +136,17 @@ class DictionaryManager(models.Manager): ...@@ -133,7 +136,17 @@ class DictionaryManager(models.Manager):
# Prepare analyzers # Prepare analyzers
# - standard analyzer simply splits words # - standard analyzer simply splits words
# - stemming extracts stems, to catch things like plurals # - stemming extracts stems, to catch things like plurals
analyzers = [
    StandardAnalyzer(),
    StemmingAnalyzer(),
]
language = unit.translation.language
lang_code = language.base_code()
# Add per language analyzer if Whoosh has it
if has_stemmer(lang_code):
    analyzers.append(LanguageAnalyzer(lang_code))
# Add ngram analyzer for languages like Chinese or Japanese
if language.uses_ngram():
    # The imported class is spelled NgramAnalyzer (lowercase "g"), and
    # its constructor requires the minimum gram size.
    # NOTE(review): 4 is a starting value; tune it against real
    # dictionary entries for these languages.
    analyzers.append(NgramAnalyzer(4))
# Extract words from all plurals and from context # Extract words from all plurals and from context
for text in unit.get_source_plurals() + [unit.context]: for text in unit.get_source_plurals() + [unit.context]:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment