more detailed sphinx.conf template, using the English stemming rule for now.

29f65832 · Kazuhiko Shiozaki · f0060584 · 29f65832
Commit 29f65832 authored Sep 21, 2011 by Kazuhiko Shiozaki
Hide whitespace changes
Inline Side-by-side

Showing with 182 additions and 0 deletions

slapos/recipe/erp5/template/sphinx.conf.in slapos/recipe/erp5/template/sphinx.conf.in +182 -0

No files found.
--- a/slapos/recipe/erp5/template/sphinx.conf.in
+++ b/slapos/recipe/erp5/template/sphinx.conf.in
@@ -38,6 +38,57 @@ index erp5
 	# rt_attr_timestamp	= ts_added
 	# rt_attr_string		= author

+	# document attribute values (docinfo) storage mode
+	# optional, default is 'extern'
+	# known values are 'none', 'extern' and 'inline'
+	# docinfo			= extern
+
+	# memory locking for cached data (.spa and .spi), to prevent swapping
+	# optional, default is 0 (do not mlock)
+	# requires searchd to be run as root
+	# mlock			= 0
+
+	# a list of morphology preprocessors to apply
+	# optional, default is empty
+	#
+	# builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
+	# 'soundex', and 'metaphone'; additional preprocessors available from
+	# libstemmer are 'libstemmer_XXX', where XXX is algorithm code
+	# (see libstemmer_c/libstemmer/modules.txt)
+	#
+	# morphology		= stem_en, stem_ru, soundex
+	# morphology		= libstemmer_german
+	# morphology		= libstemmer_sv
+	morphology		= stem_en
+
+	# minimum word length at which to enable stemming
+	# optional, default is 1 (stem everything)
+	#
+	# min_stemming_len	= 1
+
+	# stopword files list (space separated)
+	# optional, default is empty
+	# contents are plain text, charset_table and stemming are both applied
+	#
+	# stopwords		= %(data_directory)s/erp5/stopwords.txt
+
+	# wordforms file, in "mapfrom > mapto" plain text format
+	# optional, default is empty
+	#
+	# wordforms		= %(data_directory)s/erp5/wordforms.txt
+
+	# tokenizing exceptions file
+	# optional, default is empty
+	#
+	# plain text, case sensitive, space insensitive in map-from part
+	# one "Map Several Words => ToASingleOne" entry per line
+	#
+	# exceptions		= %(data_directory)s/erp5/exceptions.txt
+
+	# minimum indexed word length
+	# default is 1 (index everything)
+	min_word_len		= 1
+
 	# charset encoding type
 	# optional, default is 'sbcs'
 	# known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
@@ -245,6 +296,39 @@ index erp5
 		U+0B92, U+0B93, U+0B95, U+0B99, U+0B9A, U+0B9C, U+0B9E, U+0B9F, U+0BA3, U+0BA4, U+0BA8..U+0BAA, U+0BAE..U+0BB9, U+0BE6..U+0BEF, U+0E01..U+0E30, U+0E32, U+0E33, U+0E40..U+0E46, U+0E50..U+0E5B, U+FF10..U+FF19->0..9, U+FF21..U+FF3A->a..z, U+FF41..U+FF5A->a..z,  \
 		0..9, A..Z->a..z, a..z

+	# ignored characters list
+	# optional, default value is empty
+	#
+	# ignore_chars		= U+00AD
+
+	# minimum word prefix length to index
+	# optional, default is 0 (do not index prefixes)
+	#
+	# min_prefix_len		= 0
+
+	# minimum word infix length to index
+	# optional, default is 0 (do not index infixes)
+	#
+	# min_infix_len		= 0
+
+	# list of fields to limit prefix/infix indexing to
+	# optional, default value is empty (index all fields in prefix/infix mode)
+	#
+	# prefix_fields		= filename
+	# infix_fields		= url, domain
+
+	# enable star-syntax (wildcards) when searching prefix/infix indexes
+	# search-time only, does not affect indexing, can be 0 or 1
+	# optional, default is 0 (do not use wildcard syntax)
+	#
+	# enable_star		= 1
+
+	# expand keywords with exact forms and/or stars when searching suitable indexes
+	# search-time only, does not affect indexing, can be 0 or 1
+	# optional, default is 0 (do not expand keywords)
+	#
+	# expand_keywords		= 1
+
 	# n-gram length to index, for CJK indexing
 	# only supports 0 and 1 for now, other lengths to be implemented
 	# optional, default is 0 (disable n-grams)
@@ -255,6 +339,104 @@ index erp5
 	# optional, default is empty
 	#
 	ngram_chars		= U+4E00..U+9FBB, U+3400..U+4DB5, U+20000..U+2A6D6, U+FA0E, U+FA0F, U+FA11, U+FA13, U+FA14, U+FA1F, U+FA21, U+FA23, U+FA24, U+FA27, U+FA28, U+FA29, U+3105..U+312C, U+31A0..U+31B7, U+3041, U+3043, U+3045, U+3047, U+3049, U+304B, U+304D, U+304F, U+3051, U+3053, U+3055, U+3057, U+3059, U+305B, U+305D, U+305F, U+3061, U+3063, U+3066, U+3068, U+306A..U+306F, U+3072, U+3075, U+3078, U+307B, U+307E..U+3083, U+3085, U+3087, U+3089..U+308E, U+3090..U+3093, U+30A1, U+30A3, U+30A5, U+30A7, U+30A9, U+30AD, U+30AF, U+30B3, U+30B5, U+30BB, U+30BD, U+30BF, U+30C1, U+30C3, U+30C4, U+30C6, U+30CA, U+30CB, U+30CD, U+30CE, U+30DE, U+30DF, U+30E1, U+30E2, U+30E3, U+30E5, U+30E7, U+30EE, U+30F0..U+30F3, U+30F5, U+30F6, U+31F0, U+31F1, U+31F2, U+31F3, U+31F4, U+31F5, U+31F6, U+31F7, U+31F8, U+31F9, U+31FA, U+31FB, U+31FC, U+31FD, U+31FE, U+31FF, U+AC00..U+D7A3, U+1100..U+1159, U+1161..U+11A2, U+11A8..U+11F9, U+A000..U+A48C, U+A492..U+A4C6
+
+	# phrase boundary characters list
+	# optional, default is empty
+	#
+	# phrase_boundary		= ., ?, !, U+2026 # horizontal ellipsis
+
+	# phrase boundary word position increment
+	# optional, default is 0
+	#
+	# phrase_boundary_step	= 100
+
+	# blended characters list
+	# blended chars are indexed both as separators and valid characters
+	# for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
+	# optional, default is empty
+	#
+	# blend_chars		= +, &, U+23
+
+	# blended token indexing mode
+	# a comma separated list of blended token indexing variants
+	# known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
+	# optional, default is trim_none
+	#
+	# blend_mode		= trim_tail, skip_pure
+
+	# whether to strip HTML tags from incoming documents
+	# known values are 0 (do not strip) and 1 (do strip)
+	# optional, default is 0
+	html_strip		= 0
+
+	# what HTML attributes to index if stripping HTML
+	# optional, default is empty (do not index anything)
+	#
+	# html_index_attrs	= img=alt,title; a=title;
+
+	# what HTML elements contents to strip
+	# optional, default is empty (do not strip element contents)
+	#
+	# html_remove_elements	= style, script
+
+	# whether to preopen index data files on startup
+	# optional, default is 0 (do not preopen), searchd-only
+	#
+	# preopen			= 1
+
+	# whether to keep dictionary (.spi) on disk, or cache it in RAM
+	# optional, default is 0 (cache in RAM), searchd-only
+	#
+	# ondisk_dict		= 1
+
+	# whether to enable in-place inversion (2x less disk, 90-95% speed)
+	# optional, default is 0 (use separate temporary files), indexer-only
+	#
+	# inplace_enable		= 1
+
+	# in-place fine-tuning options
+	# optional, defaults are listed below
+	#
+	# inplace_hit_gap		= 0 # preallocated hitlist gap size
+	# inplace_docinfo_gap	= 0 # preallocated docinfo gap size
+	# inplace_reloc_factor	= 0.1 # relocation buffer size within arena
+	# inplace_write_factor	= 0.1 # write buffer size within arena
+
+	# whether to index original keywords along with stemmed versions
+	# enables "=exactform" operator to work
+	# optional, default is 0
+	#
+	# index_exact_words	= 1
+
+	# position increment on overshort (less than min_word_len) words
+	# optional, allowed values are 0 and 1, default is 1
+	#
+	# overshort_step		= 1
+
+	# position increment on stopword
+	# optional, allowed values are 0 and 1, default is 1
+	#
+	# stopword_step		= 1
+
+	# hitless words list
+	# positions for these keywords will not be stored in the index
+	# optional, allowed values are 'all', or a list file name
+	#
+	# hitless_words		= all
+	# hitless_words		= hitless.txt
+
+	# detect and index sentence and paragraph boundaries
+	# required for the SENTENCE and PARAGRAPH operators to work
+	# optional, allowed values are 0 and 1, default is 0
+	#
+	# index_sp			= 1
+
+	# index zones, delimited by HTML/XML tags
+	# a comma separated list of tags and wildcards
+	# required for the ZONE operator to work
+	# optional, default is empty string (do not index zones)
+	#
+	# index_zones		= title, h*, th
 }

 #############################################################################