Commit e804c009 authored by Stan Hu's avatar Stan Hu

Fix Error 500 viewing pipelines with invalid UTF-8 data

https://gitlab.com/gitlab-org/gitlab/-/merge_requests/66423 fixed the
build log view when invalid UTF-8 characters were present, but the
pipeline view would still fail since it would attempt to show a few
lines from the build.

We consolidate the helper method in EncodingHelper and use it
to drop invalid UTF-8 characters.

Relates to https://gitlab.com/gitlab-org/gitlab/-/issues/336683

Changelog: fixed
parent c11a8d2b
......@@ -33,6 +33,8 @@ module Gitlab
Result = Struct.new(:html, :state, :append, :truncated, :offset, :size, :total, keyword_init: true) # rubocop:disable Lint/StructNewOverride
class Converter
include EncodingHelper
def on_0(_)
reset
end
......@@ -256,6 +258,7 @@ module Gitlab
start_offset = @offset
stream.each_line do |line|
line = encode_utf8_no_detect(line)
s = StringScanner.new(line)
until s.eos?
......
......@@ -9,6 +9,8 @@ module Gitlab
# Line::Segment is a portion of a line that has its own style
# and text. Multiple segments make the line content.
class Segment
include EncodingHelper
attr_accessor :text, :style
def initialize(style:)
......@@ -21,19 +23,15 @@ module Gitlab
end
def to_h
{ text: encode_text(text) }.tap do |result|
# Without forcing the encoding to UTF-8 and then replacing
# invalid UTF-8 sequences we can get an error when serializing
# the Hash to JSON.
# Encoding::UndefinedConversionError:
# "\xE2" from ASCII-8BIT to UTF-8
{ text: encode_utf8_no_detect(text) }.tap do |result|
result[:style] = style.to_s if style.set?
end
end
# Without forcing the encoding to UTF-8 and then dropping
# invalid UTF-8 sequences we can get an error when serializing
# the Hash to JSON.
# Encoding::UndefinedConversionError:
# "\xE2" from ASCII-8BIT to UTF-8
def encode_text(text)
text.force_encoding(Encoding::UTF_8).encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
end
end
attr_reader :offset, :sections, :segments, :current_segment,
......
......@@ -64,6 +64,15 @@ module Gitlab
detect && detect[:type] == :binary
end
# This is like encode_utf8 except we skip autodetection of the encoding. We
# assume the data must be interpreted as UTF-8.
def encode_utf8_no_detect(message)
message = force_encode_utf8(message)
return message if message.valid_encoding?
message.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
end
def encode_utf8(message, replace: "")
message = force_encode_utf8(message)
return message if message.valid_encoding?
......
......@@ -150,6 +150,10 @@ RSpec.describe Gitlab::Ci::Ansi2html do
expect(convert_html("\r\n")).to eq('<span><br/></span>')
end
it 'replaces invalid UTF-8 data' do
expect(convert_html("UTF-8 dashes here: ───\n🐤🐤🐤🐤\xF0\x9F\x90\n")).to eq("<span>UTF-8 dashes here: ───<br/>🐤🐤🐤🐤�<br/></span>")
end
describe "incremental update" do
shared_examples 'stateable converter' do
let(:pass1_stream) { StringIO.new(pre_text) }
......
......@@ -3,6 +3,8 @@
require "spec_helper"
RSpec.describe Gitlab::EncodingHelper do
using RSpec::Parameterized::TableSyntax
let(:ext_class) { Class.new { extend Gitlab::EncodingHelper } }
let(:binary_string) { File.read(Rails.root + "spec/fixtures/dk.png") }
......@@ -90,6 +92,22 @@ RSpec.describe Gitlab::EncodingHelper do
end
end
describe '#encode_utf8_no_detect' do
where(:input, :expected) do
"abcd" | "abcd"
"DzDzDz" | "DzDzDz"
"\xC7\xB2\xC7DzDzDz" | "Dz�DzDzDz"
"🐤🐤🐤🐤\xF0\x9F\x90" | "🐤🐤🐤🐤�"
end
with_them do
it 'drops invalid UTF-8' do
expect(ext_class.encode_utf8_no_detect(input.dup.force_encoding(Encoding::ASCII_8BIT))).to eq(expected)
expect(ext_class.encode_utf8_no_detect(input)).to eq(expected)
end
end
end
describe '#encode_utf8' do
[
["nil", nil, nil],
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment