Commit 9e6d4635 authored by Robert Speicher

Merge branch '225171-review-global-search-code-regexes-in-elasticsearch-for-accuracy' into 'master'

Simplify Advanced Search code_analyzer regex complexity

See merge request gitlab-org/gitlab!37372
parents e8fc3f98 167147da
---
title: Simplify Advanced Search code_analyzer regex complexity
merge_request: 37372
author:
type: changed
...@@ -7,6 +7,15 @@ module Elastic ...@@ -7,6 +7,15 @@ module Elastic
extend Elasticsearch::Model::Indexing::ClassMethods extend Elasticsearch::Model::Indexing::ClassMethods
extend Elasticsearch::Model::Naming::ClassMethods extend Elasticsearch::Model::Naming::ClassMethods
# Regex patterns for the code analyzer's pattern_capture filter.
# Elasticsearch regex requires backslash characters to be escaped.
# The patterns are stored in single-quoted Ruby strings, so each \ below is kept as a literal
# backslash and does not need to be doubled here (as it would in a double-quoted string).
ANY_CASE_WORD_PATTERN = '(\p{Ll}+|\p{Lu}\p{Ll}+|\p{Lu}+)' # match words with any upper/lowercase combination
CAMEL_CASE_WORD_PATTERN = '(?=([\p{Lu}]+[\p{L}]+))' # match camel cased words, used to split into smaller tokens
CODE_TOKEN_PATTERN = '([\p{L}\d_]+)' # letters, numbers & underscores are the most common tokens in programming. Always capture them greedily regardless of context.
DIGIT_PATTERN = '(\d+)' # match a run of one or more digits
FILE_NAME_PATTERN = '([\p{L}\p{N}_.-]+)' # some common chars in file names to keep the whole filename intact (e.g. my_file-name-01.txt)
PERIOD_PATTERN = '\.([^.]+)(?=\.|\s|\Z)' # separate terms on periods
self.index_name = [Rails.application.class.module_parent_name.downcase, Rails.env].join('-') self.index_name = [Rails.application.class.module_parent_name.downcase, Rails.env].join('-')
# ES6 requires a single type per index # ES6 requires a single type per index
...@@ -61,14 +70,12 @@ module Elastic ...@@ -61,14 +70,12 @@ module Elastic
type: "pattern_capture", type: "pattern_capture",
preserve_original: true, preserve_original: true,
patterns: [ patterns: [
"(\\p{Ll}+|\\p{Lu}\\p{Ll}+|\\p{Lu}+)", ANY_CASE_WORD_PATTERN,
"(\\d+)", CAMEL_CASE_WORD_PATTERN,
"(?=([\\p{Lu}]+[\\p{L}]+))", CODE_TOKEN_PATTERN,
'"((?:\\"|[^"]|\\")*)"', # capture terms inside quotes, removing the quotes DIGIT_PATTERN,
"'((?:\\'|[^']|\\')*)'", # same as above, for single quotes FILE_NAME_PATTERN,
'\.([^.]+)(?=\.|\s|\Z)', # separate terms on periods PERIOD_PATTERN
'([\p{L}_.-]+)', # some common chars in file names to keep the whole filename intact (eg. my_file-name.txt)
'([\p{L}\d_]+)' # letters, numbers and underscores are the most common tokens in programming. Always capture them greedily regardless of context.
] ]
} }
}, },
......
...@@ -701,7 +701,7 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need ...@@ -701,7 +701,7 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need
expect(search_for('missing_token_around_equals')).to include(file_name) expect(search_for('missing_token_around_equals')).to include(file_name)
end end
it 'finds a ruby method name even if preceeded with dot' do it 'finds a ruby method name even if preceded with dot' do
expect(search_for('ruby_method_name')).to include(file_name) expect(search_for('ruby_method_name')).to include(file_name)
end end
...@@ -709,7 +709,7 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need ...@@ -709,7 +709,7 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need
expect(search_for('ruby_method_123')).to include(file_name) expect(search_for('ruby_method_123')).to include(file_name)
end end
it 'finds a ruby method call even if preceeded with dot' do it 'finds a ruby method call even if preceded with dot' do
expect(search_for('ruby_method_call')).to include(file_name) expect(search_for('ruby_method_call')).to include(file_name)
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment