Merge branch '225171-review-global-search-code-regexes-in-elasticsearch-for-accuracy' into 'master'

Simplify Advanced Search code_analyzer regex complexity. See merge request gitlab-org/gitlab!37372.

Merge branch '225171-review-global-search-code-regexes-in-elasticsearch-for-accuracy' into 'master'
Simplify Advanced Search code_analyzer regex complexity. See merge request gitlab-org/gitlab!37372.
9e6d4635 · Robert Speicher · e8fc3f98 · 167147da · 9e6d4635 · 9e6d4635
Commit 9e6d4635 authored Aug 07, 2020 by Robert Speicher
3 changed files
--- a/ee/changelogs/unreleased/225171-review-global-search-code-regexes-in-elasticsearch-for-accuracy.yml
+++ b/ee/changelogs/unreleased/225171-review-global-search-code-regexes-in-elasticsearch-for-accuracy.yml
+---
+title: Simplify Advanced Search code_analyzer regex complexity
+merge_request: 37372
+author:
+type: changed
--- a/ee/lib/elastic/latest/config.rb
+++ b/ee/lib/elastic/latest/config.rb
@@ -7,6 +7,15 @@ module Elastic
      extend Elasticsearch::Model::Indexing::ClassMethods
      extend Elasticsearch::Model::Naming::ClassMethods
+      # Regex patterns. Elasticsearch regexes require backslash characters to be escaped.
+      # The patterns are stored in single-quoted strings so that Ruby keeps each backslash literally.
+      ANY_CASE_WORD_PATTERN = '(\p{Ll}+|\p{Lu}\p{Ll}+|\p{Lu}+)' # match words with any upper/lowercase combination
+      CAMEL_CASE_WORD_PATTERN = '(?=([\p{Lu}]+[\p{L}]+))' # match camel cased words, used to split into smaller tokens
+      CODE_TOKEN_PATTERN = '([\p{L}\d_]+)' # letters, numbers & underscores are the most common tokens in programming. Always capture them greedily regardless of context.
+      DIGIT_PATTERN = '(\d+)' # match digits of any length
+      FILE_NAME_PATTERN = '([\p{L}\p{N}_.-]+)' # some common chars in file names to keep the whole filename intact (eg. my_file-name-01.txt)
+      PERIOD_PATTERN = '\.([^.]+)(?=\.|\s|\Z)' # separate terms on periods
      self.index_name = [Rails.application.class.module_parent_name.downcase, Rails.env].join('-')
      # ES6 requires a single type per index
@@ -61,14 +70,12 @@ module Elastic
                type: "pattern_capture",
                preserve_original: true,
                patterns: [
-                  "(\\p{Ll}+|\\p{Lu}\\p{Ll}+|\\p{Lu}+)",
+                  ANY_CASE_WORD_PATTERN,
-                  "(\\d+)",
+                  CAMEL_CASE_WORD_PATTERN,
-                  "(?=([\\p{Lu}]+[\\p{L}]+))",
+                  CODE_TOKEN_PATTERN,
-                  '"((?:\\"|[^"]|\\")*)"', # capture terms inside quotes, removing the quotes
+                  DIGIT_PATTERN,
-                  "'((?:\\'|[^']|\\')*)'", # same as above, for single quotes
+                  FILE_NAME_PATTERN,
-                  '\.([^.]+)(?=\.|\s|\Z)', # separate terms on periods
+                  PERIOD_PATTERN
-                  '([\p{L}_.-]+)', # some common chars in file names to keep the whole filename intact (eg. my_file-name.txt)
-                  '([\p{L}\d_]+)' # letters, numbers and underscores are the most common tokens in programming. Always capture them greedily regardless of context.
                ]
              }
            },

--- a/ee/spec/lib/gitlab/elastic/search_results_spec.rb
+++ b/ee/spec/lib/gitlab/elastic/search_results_spec.rb
@@ -701,7 +701,7 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need
        expect(search_for('missing_token_around_equals')).to include(file_name)
      end
-      it 'finds a ruby method name even if preceeded with dot' do
+      it 'finds a ruby method name even if preceded with dot' do
        expect(search_for('ruby_method_name')).to include(file_name)
      end
@@ -709,7 +709,7 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need
        expect(search_for('ruby_method_123')).to include(file_name)
      end
-      it 'finds a ruby method call even if preceeded with dot' do
+      it 'finds a ruby method call even if preceded with dot' do
        expect(search_for('ruby_method_call')).to include(file_name)
      end