Commit 520866a0 authored by Alejandro Rodríguez's avatar Alejandro Rodríguez

Avoid unnecessary `force_encoding` operations

They're costly. This will also avoid some edge cases where
charlock_holmes assigns a weird encoding to a perfectly valid UTF-8
string.
parent 00c15cc2
...@@ -14,9 +14,9 @@ module Gitlab ...@@ -14,9 +14,9 @@ module Gitlab
ENCODING_CONFIDENCE_THRESHOLD = 50 ENCODING_CONFIDENCE_THRESHOLD = 50
def encode!(message) def encode!(message)
return nil unless message.respond_to? :force_encoding return nil unless message.respond_to?(:force_encoding)
return message if message.encoding == Encoding::UTF_8 && message.valid_encoding?
# if message is utf-8 encoding, just return it
message.force_encoding("UTF-8") message.force_encoding("UTF-8")
return message if message.valid_encoding? return message if message.valid_encoding?
...@@ -50,6 +50,9 @@ module Gitlab ...@@ -50,6 +50,9 @@ module Gitlab
end end
def encode_utf8(message) def encode_utf8(message)
return nil unless message.is_a?(String)
return message if message.encoding == Encoding::UTF_8 && message.valid_encoding?
detect = CharlockHolmes::EncodingDetector.detect(message) detect = CharlockHolmes::EncodingDetector.detect(message)
if detect && detect[:encoding] if detect && detect[:encoding]
begin begin
......
...@@ -6,6 +6,9 @@ describe Gitlab::EncodingHelper do ...@@ -6,6 +6,9 @@ describe Gitlab::EncodingHelper do
describe '#encode!' do describe '#encode!' do
[ [
["nil", nil, nil],
["empty string", "".encode("ASCII-8BIT"), "".encode("UTF-8")],
["invalid utf-8 encoded string", "my bad string\xE5".force_encoding("UTF-8"), "my bad string"],
[ [
'leaves ascii only string as is', 'leaves ascii only string as is',
'ascii only string', 'ascii only string',
...@@ -81,6 +84,9 @@ describe Gitlab::EncodingHelper do ...@@ -81,6 +84,9 @@ describe Gitlab::EncodingHelper do
describe '#encode_utf8' do describe '#encode_utf8' do
[ [
["nil", nil, nil],
["empty string", "".encode("ASCII-8BIT"), "".encode("UTF-8")],
["invalid utf-8 encoded string", "my bad string\xE5".force_encoding("UTF-8"), "my bad stringå"],
[ [
"encodes valid utf8 encoded string to utf8", "encodes valid utf8 encoded string to utf8",
"λ, λ, λ".encode("UTF-8"), "λ, λ, λ".encode("UTF-8"),
...@@ -95,12 +101,18 @@ describe Gitlab::EncodingHelper do ...@@ -95,12 +101,18 @@ describe Gitlab::EncodingHelper do
"encodes valid ISO-8859-1 encoded string to utf8", "encodes valid ISO-8859-1 encoded string to utf8",
"Rüby ist eine Programmiersprache. Wir verlängern den text damit ICU die Sprache erkennen kann.".encode("ISO-8859-1", "UTF-8"), "Rüby ist eine Programmiersprache. Wir verlängern den text damit ICU die Sprache erkennen kann.".encode("ISO-8859-1", "UTF-8"),
"Rüby ist eine Programmiersprache. Wir verlängern den text damit ICU die Sprache erkennen kann.".encode("UTF-8") "Rüby ist eine Programmiersprache. Wir verlängern den text damit ICU die Sprache erkennen kann.".encode("UTF-8")
],
[
# Test case from https://gitlab.com/gitlab-org/gitlab-ce/issues/39227
"Equifax branch name",
"refs/heads/Equifax".encode("UTF-8"),
"refs/heads/Equifax".encode("UTF-8")
] ]
].each do |description, test_string, xpect| ].each do |description, test_string, xpect|
it description do it description do
r = ext_class.encode_utf8(test_string.force_encoding('UTF-8')) r = ext_class.encode_utf8(test_string)
expect(r).to eq(xpect) expect(r).to eq(xpect)
expect(r.encoding.name).to eq('UTF-8') expect(r.encoding.name).to eq('UTF-8') if xpect
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment