Optimize the reference scanning process

Contributes to https://gitlab.com/gitlab-org/gitlab/-/issues/326323

**Problem**

For reference cache generation, we convert and scan each node one by one for reference identifiers. This process is slow and consumes a lot of memory.

**Solution**

Scan the whole document directly. This removes many unnecessary transformation operations on nodes.

Changelog: performance

Optimize the reference scanning process

Contributes to https://gitlab.com/gitlab-org/gitlab/-/issues/326323

**Problem**

For reference cache generation, we convert and scan each node one by one for reference identifiers. This process is slow and consumes a lot of memory.

**Solution**

Scan the whole document directly. This removes many unnecessary transformation operations on nodes.

Changelog: performance
7b6fa267 · Vasilii Iakliushin · f095fc89 · 7b6fa267 · 7b6fa267 · 7b6fa267
Commit 7b6fa267 authored Jul 09, 2021 by Vasilii Iakliushin
5 changed files
--- a/app/models/milestone.rb
+++ b/app/models/milestone.rb
@@ -61,10 +61,38 @@ class Milestone < ApplicationRecord
  end

  def self.reference_pattern
+    if Feature.enabled?(:milestone_reference_pattern, default_enabled: :yaml)
+      new_reference_pattern
+    else
+      old_reference_pattern
+    end
+  end
+
+  def self.new_reference_pattern
+    # NOTE: The iid pattern only matches when all characters in the expression
+    # are digits, so it will match %2 but not %2.1, because the latter is
+    # probably a milestone name and we want it to be matched as such.
+    @new_reference_pattern ||= %r{
+      (#{Project.reference_pattern})?
+      #{Regexp.escape(reference_prefix)}
+      (?:
+        (?<milestone_iid>
+          \d+(?!\S\w)\b # Integer-based milestone iid, or
+        ) |
+        (?<milestone_name>
+          [^"\s\<]+\b |  # String-based single-word milestone title, or
+          "[^"]+"      # String-based multi-word milestone surrounded in quotes
+        )
+      )
+    }x
+  end
+
+  # Deprecated: https://gitlab.com/gitlab-org/gitlab/-/issues/336268
+  def self.old_reference_pattern
    # NOTE: The iid pattern only matches when all characters on the expression
    # are digits, so it will match %2 but not %2.1 because that's probably a
    # milestone name and we want it to be matched as such.
-    @reference_pattern ||= %r{
+    @old_reference_pattern ||= %r{
      (#{Project.reference_pattern})?
      #{Regexp.escape(reference_prefix)}
      (?:

--- a/config/feature_flags/development/milestone_reference_pattern.yml
+++ b/config/feature_flags/development/milestone_reference_pattern.yml
+---
+name: milestone_reference_pattern
+introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/65847
+rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/336268
+milestone: '14.1'
+type: development
+group: group::source code
+default_enabled: false
--- a/lib/banzai/filter/references/reference_cache.rb
+++ b/lib/banzai/filter/references/reference_cache.rb
@@ -28,20 +28,11 @@ module Banzai
          @references_per_parent[parent_type] ||= begin
            refs = Hash.new { |hash, key| hash[key] = Set.new }

-            nodes.each do |node|
-              prepare_node_for_scan(node).scan(regex) do
-                parent_path = if parent_type == :project
-                                full_project_path($~[:namespace], $~[:project])
-                              else
-                                full_group_path($~[:group])
-                              end
-
-                ident = filter.identifier($~)
-                refs[parent_path] << ident if ident
-              end
+            if Feature.enabled?(:milestone_reference_pattern, default_enabled: :yaml)
+              doc_search(refs)
+            else
+              node_search(nodes, refs)
            end
-
-            refs
          end
        end

@@ -172,6 +163,39 @@ module Banzai

        delegate :project, :group, :parent, :parent_type, to: :filter

+        # Deprecated: https://gitlab.com/gitlab-org/gitlab/-/issues/336268
+        def node_search(nodes, refs)
+          nodes.each do |node|
+            prepare_node_for_scan(node).scan(regex) do
+              parent_path = if parent_type == :project
+                              full_project_path($~[:namespace], $~[:project])
+                            else
+                              full_group_path($~[:group])
+                            end
+
+              ident = filter.identifier($~)
+              refs[parent_path] << ident if ident
+            end
+          end
+
+          refs
+        end
+
+        def doc_search(refs)
+          prepare_doc_for_scan(filter.doc).to_enum(:scan, regex).each do
+            parent_path = if parent_type == :project
+                            full_project_path($~[:namespace], $~[:project])
+                          else
+                            full_group_path($~[:group])
+                          end
+
+            ident = filter.identifier($~)
+            refs[parent_path] << ident if ident
+          end
+
+          refs
+        end
+
        def regex
          strong_memoize(:regex) do
            [
@@ -185,6 +209,13 @@ module Banzai
          Gitlab::SafeRequestStore["banzai_#{parent_type}_refs".to_sym] ||= {}
        end

+        def prepare_doc_for_scan(doc)
+          html = doc.to_html
+
+          filter.requires_unescaping? ? unescape_html_entities(html) : html
+        end
+
+        # Deprecated: https://gitlab.com/gitlab-org/gitlab/-/issues/336268
        def prepare_node_for_scan(node)
          html = node.to_html


--- a/spec/lib/banzai/filter/references/milestone_reference_filter_spec.rb
+++ b/spec/lib/banzai/filter/references/milestone_reference_filter_spec.rb
@@ -92,6 +92,11 @@ RSpec.describe Banzai::Filter::References::MilestoneReferenceFilter do
      expect(doc.to_html).to match(%r(\(<a.+>#{milestone.reference_link_text}</a>\.\)))
    end

+    it 'links with adjacent html tags' do
+      doc = reference_filter("Milestone <p>#{reference}</p>.")
+      expect(doc.to_html).to match(%r(<p><a.+>#{milestone.reference_link_text}</a></p>))
+    end
+
    it 'ignores invalid milestone names' do
      exp = act = "Milestone #{Milestone.reference_prefix}#{milestone.name.reverse}"


--- a/spec/models/milestone_spec.rb
+++ b/spec/models/milestone_spec.rb
@@ -538,6 +538,15 @@ RSpec.describe Milestone do

    it { is_expected.to match('gitlab-org/gitlab-ce%123') }
    it { is_expected.to match('gitlab-org/gitlab-ce%"my-milestone"') }
+
+    context 'when milestone_reference_pattern feature flag is false' do
+      before do
+        stub_feature_flags(milestone_reference_pattern: false)
+      end
+
+      it { is_expected.to match('gitlab-org/gitlab-ce%123') }
+      it { is_expected.to match('gitlab-org/gitlab-ce%"my-milestone"') }
+    end
  end

  describe '.link_reference_pattern' do