Commit ba39b26c authored by Sean McGivern's avatar Sean McGivern

Merge branch '35942_api_binary_encoding' into 'master'

API fix for non UTF-8 data

Closes #35942

See merge request !14038
parents 93e1d4dd 46f6092a
---
title: "Fix API to serve binary diffs that are treated as text."
merge_request: 14038
......@@ -104,7 +104,7 @@ module API
not_found! 'Commit' unless commit
commit.raw_diffs.to_a
present commit.raw_diffs.to_a, with: Entities::RepoDiff
end
desc "Get a commit's comments" do
......
......@@ -291,10 +291,11 @@ module API
end
class RepoDiff < Grape::Entity
expose :old_path, :new_path, :a_mode, :b_mode, :diff
expose :old_path, :new_path, :a_mode, :b_mode
expose :new_file?, as: :new_file
expose :renamed_file?, as: :renamed_file
expose :deleted_file?, as: :deleted_file
expose :json_safe_diff, as: :diff
end
class ProtectedRefAccess < Grape::Entity
......
......@@ -22,10 +22,10 @@ module Gitlab
# return message if message type is binary
detect = CharlockHolmes::EncodingDetector.detect(message)
return message.force_encoding("BINARY") if detect && detect[:type] == :binary
return message.force_encoding("BINARY") if detect_binary?(message, detect)
# force detected encoding if we have sufficient confidence.
if detect && detect[:encoding] && detect[:confidence] > ENCODING_CONFIDENCE_THRESHOLD
# force detected encoding if we have sufficient confidence.
message.force_encoding(detect[:encoding])
end
......@@ -36,6 +36,19 @@ module Gitlab
"--broken encoding: #{encoding}"
end
def detect_binary?(data, detect = nil)
detect ||= CharlockHolmes::EncodingDetector.detect(data)
detect && detect[:type] == :binary && detect[:confidence] == 100
end
def detect_libgit2_binary?(data)
# EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks
# only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15),
# which is what we use below to keep a consistent behavior.
detect = CharlockHolmes::EncodingDetector.new(8000).detect(data)
detect && detect[:type] == :binary
end
def encode_utf8(message)
detect = CharlockHolmes::EncodingDetector.detect(message)
if detect && detect[:encoding]
......
......@@ -42,14 +42,6 @@ module Gitlab
end
end
def binary?(data)
# EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks
# only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15),
# which is what we use below to keep a consistent behavior.
detect = CharlockHolmes::EncodingDetector.new(8000).detect(data)
detect && detect[:type] == :binary
end
# Returns an array of Blob instances, specified in blob_references as
# [[commit_sha, path], [commit_sha, path], ...]. If blob_size_limit < 0 then the
# full blob contents are returned. If blob_size_limit >= 0 then each blob will
......@@ -65,6 +57,10 @@ module Gitlab
end
end
def binary?(data)
EncodingHelper.detect_libgit2_binary?(data)
end
private
# Recursive search of blob id by path
......
......@@ -116,6 +116,15 @@ module Gitlab
filtered_opts
end
# Return a binary diff message like:
#
# "Binary files a/file/path and b/file/path differ\n"
# This is used when we detect that a diff is binary
# using CharlockHolmes when Rugged treats it as text.
def binary_message(old_path, new_path)
"Binary files #{old_path} and #{new_path} differ\n"
end
end
def initialize(raw_diff, expanded: true)
......@@ -190,6 +199,13 @@ module Gitlab
@collapsed = true
end
def json_safe_diff
return @diff unless detect_binary?(@diff)
# the diff is binary, let's make a message for it
Diff.binary_message(@old_path, @new_path)
end
private
def init_from_rugged(rugged)
......
......@@ -273,6 +273,25 @@ EOT
end
end
describe '#json_safe_diff' do
let(:project) { create(:project, :repository) }
it 'fake binary message when it detects binary' do
# Rugged will not detect this as binary, but we can fake it
diff_message = "Binary files files/images/icn-time-tracking.pdf and files/images/icn-time-tracking.pdf differ\n"
binary_diff = described_class.between(project.repository, 'add-pdf-text-binary', 'add-pdf-text-binary^').first
expect(binary_diff.diff).not_to be_empty
expect(binary_diff.json_safe_diff).to eq(diff_message)
end
it 'leave non-binary diffs as-is' do
diff = described_class.new(@rugged_diff)
expect(diff.json_safe_diff).to eq(diff.diff)
end
end
describe '#submodule?' do
before do
commit = repository.lookup('5937ac0a7beb003549fc5fd26fc247adbce4a52e')
......
......@@ -673,6 +673,12 @@ describe API::Commits do
it_behaves_like 'ref diff'
end
end
context 'when binary diff are treated as text' do
let(:commit_id) { TestEnv::BRANCH_SHA['add-pdf-text-binary'] }
it_behaves_like 'ref diff'
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment