Commit 6dab206d authored by Nick Thomas's avatar Nick Thomas

Merge branch 'cached-encoding-detection' into 'master'

Cached binary detection [RUN ALL RSPEC] [RUN AS-IF-FOSS]

See merge request gitlab-org/gitlab!60128
parents 22d64a74 573ec5c4
---
name: cached_encoding_detection
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/60128
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/328819
milestone: '13.12'
type: development
group: group::source code
default_enabled: false
......@@ -38,7 +38,7 @@ module Gitlab
# If Charlock says its binary
else
detect_encoding[:type] == :binary
find_encoding[:type] == :binary
end
end
......@@ -137,23 +137,25 @@ module Gitlab
end
def ruby_encoding
if hash = detect_encoding
if hash = find_encoding
hash[:ruby_encoding]
end
end
def encoding
if hash = detect_encoding
if hash = find_encoding
hash[:encoding]
end
end
def detect_encoding
@detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables
end
def empty?
data.nil? || data == ""
end
private
def find_encoding
@find_encoding ||= Gitlab::EncodingHelper.detect_encoding(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables
end
end
end
......@@ -20,7 +20,7 @@ module Gitlab
return message if message.valid_encoding?
# return message if message type is binary
detect = CharlockHolmes::EncodingDetector.detect(message)
detect = detect_encoding(message)
return message.force_encoding("BINARY") if detect_binary?(message, detect)
if detect && detect[:encoding] && detect[:confidence] > ENCODING_CONFIDENCE_THRESHOLD
......@@ -37,16 +37,30 @@ module Gitlab
"--broken encoding: #{encoding}"
end
def detect_encoding(data, limit: CharlockHolmes::EncodingDetector::DEFAULT_BINARY_SCAN_LEN, cache_key: nil)
return if data.nil?
if Feature.enabled?(:cached_encoding_detection, type: :development, default_enabled: :yaml)
return CharlockHolmes::EncodingDetector.new(limit).detect(data) unless cache_key.present?
Rails.cache.fetch([:detect_binary, CharlockHolmes::VERSION, cache_key], expires_in: 1.week) do
CharlockHolmes::EncodingDetector.new(limit).detect(data)
end
else
CharlockHolmes::EncodingDetector.new(limit).detect(data)
end
end
def detect_binary?(data, detect = nil)
detect ||= CharlockHolmes::EncodingDetector.detect(data)
detect ||= detect_encoding(data)
detect && detect[:type] == :binary && detect[:confidence] == 100
end
def detect_libgit2_binary?(data)
# EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks
# only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15),
# which is what we use below to keep a consistent behavior.
detect = CharlockHolmes::EncodingDetector.new(8000).detect(data)
# EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks
# only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15),
# which is what we use below to keep a consistent behavior.
def detect_libgit2_binary?(data, cache_key: nil)
detect = detect_encoding(data, limit: 8000, cache_key: cache_key)
detect && detect[:type] == :binary
end
......@@ -54,7 +68,8 @@ module Gitlab
message = force_encode_utf8(message)
return message if message.valid_encoding?
detect = CharlockHolmes::EncodingDetector.detect(message)
detect = detect_encoding(message)
if detect && detect[:encoding]
begin
CharlockHolmes::Converter.convert(message, detect[:encoding], 'UTF-8')
......
......@@ -110,8 +110,8 @@ module Gitlab
end
end
def binary?(data)
EncodingHelper.detect_libgit2_binary?(data)
def binary?(data, cache_key: nil)
EncodingHelper.detect_libgit2_binary?(data, cache_key: cache_key)
end
def size_could_be_lfs?(size)
......
......@@ -41,7 +41,7 @@ module Gitlab
size: blob_data[:size],
commit_id: blob_data[:revision],
data: data,
binary: Gitlab::Git::Blob.binary?(data)
binary: Gitlab::Git::Blob.binary?(data, cache_key: blob_data[:oid])
)
end
end
......
......@@ -2,7 +2,7 @@
require 'spec_helper'
RSpec.describe 'Diff file viewer', :js do
RSpec.describe 'Diff file viewer', :js, :with_clean_rails_cache do
let(:project) { create(:project, :public, :repository) }
def visit_commit(sha, anchor: nil)
......
......@@ -216,4 +216,63 @@ RSpec.describe Gitlab::EncodingHelper do
expect(test).not_to eq(io_stream)
end
end
describe '#detect_encoding' do
subject { ext_class.detect_encoding(data, **kwargs) }
let(:data) { binary_string }
let(:kwargs) { {} }
shared_examples 'detects encoding' do
it { is_expected.to be_a(Hash) }
it 'correctly detects the binary' do
expect(subject[:type]).to eq(:binary)
end
context 'data is nil' do
let(:data) { nil }
it { is_expected.to be_nil }
end
context 'limit is provided' do
let(:kwargs) do
{ limit: 10 }
end
it 'correctly detects the binary' do
expect(subject[:type]).to eq(:binary)
end
end
end
context 'cached_encoding_detection is enabled' do
before do
stub_feature_flags(cached_encoding_detection: true)
end
it_behaves_like 'detects encoding'
context 'cache_key is provided' do
let(:kwargs) do
{ cache_key: %w(foo bar) }
end
it 'uses that cache_key to serve from the cache' do
expect(Rails.cache).to receive(:fetch).with([:detect_binary, CharlockHolmes::VERSION, %w(foo bar)], expires_in: 1.week).and_call_original
expect(subject[:type]).to eq(:binary)
end
end
end
context 'cached_encoding_detection is disabled' do
before do
stub_feature_flags(cached_encoding_detection: false)
end
it_behaves_like 'detects encoding'
end
end
end
......@@ -53,7 +53,10 @@ RSpec.describe Gitlab::Git::Blame, :seed_helper do
end
it 'converts to UTF-8' do
expect(CharlockHolmes::EncodingDetector).to receive(:detect).and_return(nil)
expect_next_instance_of(CharlockHolmes::EncodingDetector) do |detector|
expect(detector).to receive(:detect).and_return(nil)
end
data = []
blame.each do |commit, line|
data << {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment