Commit 51da5736 authored by Micaël Bergeron

adds the object storage configuration

parent 0f52db38
-class GitlabELTDataDumpWorker
+class GitlabEltDataDumpWorker
   include ApplicationWorker
   include CronjobQueue
...
@@ -726,6 +726,21 @@ production: &base
     # # Specifies Amazon S3 storage class to use for backups, this is optional
     # # storage_class: 'STANDARD'
 
+  ## Pseudonym exporter
+  pseudonymizer:
+    # Tables manifest that specifies the fields to extract and pseudonymize.
+    # TODO: link to meltano configuration?
+    manifest: config/pseudonymizer.yml
+    upload:
+      # Fog storage connection settings, see http://fog.io/storage/ .
+      connection:
+        # provider: AWS
+        # region: eu-west-1
+        # aws_access_key_id: AKIAKIAKI
+        # aws_secret_access_key: 'secret123'
+      # The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
+      # remote_directory: 'gitlab-elt'
+
   ## GitLab Shell settings
   gitlab_shell:
     path: /home/git/gitlab-shell/
@@ -876,6 +891,17 @@ test:
     token: secret
   backup:
     path: tmp/tests/backups
+  pseudonymizer:
+    manifest: config/pseudonymizer.test.yml
+    upload:
+      # The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
+      remote_directory: gitlab-elt.test
+      # Fog storage connection settings, see http://fog.io/storage/
+      connection:
+        provider: AWS
+        region: us-east-1
+        aws_access_key_id: AWS_ACCESS_KEY_ID
+        aws_secret_access_key: AWS_SECRET_ACCESS_KEY
   gitlab_shell:
     path: tmp/tests/gitlab-shell/
     hooks_path: tmp/tests/gitlab-shell/hooks/
...
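For context on what that connection hash becomes at runtime: the new UploadService (later in this commit) hands it to Fog verbatim. A minimal sketch, assuming the fog-aws gem and the placeholder credentials from the example above:

  require 'fog/aws'

  # The `upload.connection` hash maps one-to-one onto Fog::Storage.new;
  # the credentials here are the placeholders from the example config.
  connection = Fog::Storage.new(
    provider: 'AWS',
    region: 'eu-west-1',
    aws_access_key_id: 'AKIAKIAKI',
    aws_secret_access_key: 'secret123'
  )

  # `remote_directory` is the bucket the CSV files are written into.
  directory = connection.directories.get('gitlab-elt')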
@@ -373,7 +373,7 @@ Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWorker'
 Settings.cron_jobs['gitlab_elt_database_dump'] ||= Settingslogic.new({})
 Settings.cron_jobs['gitlab_elt_database_dump']['cron'] ||= '0 23 * * *'
-Settings.cron_jobs['gitlab_elt_database_dump']['job_class'] ||= 'GitlabELTDataDumpWorker'
+Settings.cron_jobs['gitlab_elt_database_dump']['job_class'] ||= 'GitlabEltDataDumpWorker'
 Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({})
 Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *'
@@ -475,6 +475,14 @@ Settings.backup['upload']['multipart_chunk_size'] ||= 104857600
 Settings.backup['upload']['encryption'] ||= nil
 Settings.backup['upload']['storage_class'] ||= nil
 
+#
+# Pseudonymizer
+#
+Settings['pseudonymizer'] ||= Settingslogic.new({})
+Settings.pseudonymizer['manifest'] ||= "lib/pseudonymity/manifest.yml"
+Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil })
+# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600
+
 #
 # Git
 #
...
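A quick sanity check of how these defaults resolve, assuming a fresh boot with no pseudonymizer block in gitlab.yml (hypothetical Rails console session):

  # With no `pseudonymizer:` section configured, the fallbacks apply:
  Gitlab.config.pseudonymizer.manifest
  # => "lib/pseudonymity/manifest.yml"
  Gitlab.config.pseudonymizer.upload.connection
  # => nil  (UploadService refuses to run until this is set)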
@@ -156,7 +156,6 @@ tables:
       - last_edited_by_id
       - discussion_locked
       - closed_at
-      - closed_by_id
     pseudo:
       - id
       - title
@@ -487,8 +486,6 @@ tables:
       - merge_merge_request
       - failed_pipeline
       - success_pipeline
-      - push_to_merge_request
-      - issue_due
     pseudo:
       - id
       - user_id
@@ -509,8 +506,6 @@ tables:
       - merge_merge_request
       - failed_pipeline
       - success_pipeline
-      - push_to_merge_request
-      - issue_due
   project_authorizations:
     whitelist:
       - user_id
@@ -535,15 +530,6 @@ tables:
       - updated_at
       - enabled
       - domain
-  project_ci_cd_settings:
-    whitelist:
-      - id
-      - project_id
-      - group_runners_enabled
-    pseudo:
-      - id
-      - project_id
-      - group_runners_enabled
   project_custom_attributes:
     whitelist:
       - id
@@ -559,17 +545,6 @@ tables:
       - project_id
       - key
       - value
-  project_deploy_tokens:
-    whitelist:
-      - id
-      - project_id
-      - deploy_token_id
-      - created_at
-    pseudo:
-      - id
-      - project_id
-      - deploy_token_id
-      - created_at
   project_features:
     whitelist:
       - id
@@ -750,7 +725,6 @@ tables:
       - mirror_overwrites_diverged_branches
       - external_authorization_classification_label
       - external_webhook_token
-      - pages_https_only
     pseudo:
       - id
       - name
@@ -820,7 +794,6 @@ tables:
       - mirror_overwrites_diverged_branches
       - external_authorization_classification_label
       - external_webhook_token
-      - pages_https_only
   subscriptions:
     whitelist:
       - id
...
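As a reminder of the manifest semantics: whitelist names the columns that are exported at all, and pseudo names the subset that must be replaced by an irreversible digest before export. A hypothetical sketch of that transformation (the real logic lives in Pseudonymity::Table; the digest choice here is illustrative only):

  require 'digest'

  # Illustrative only: replace each `pseudo` column with a one-way hash
  # so rows stay joinable across tables without exposing raw values.
  def pseudonymize(row, pseudo_fields)
    pseudo_fields.each do |field|
      next if row[field].nil?
      row[field] = Digest::SHA256.hexdigest(row[field].to_s)
    end
    row
  end

  row = { "id" => 42, "user_id" => 7, "failed_pipeline" => true }
  pseudonymize(row, %w[id user_id])
  # => { "id" => "73475cb4...", "user_id" => "7902699b...", "failed_pipeline" => true }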
@@ -27,27 +27,26 @@ module Pseudonymity
   class Table
     attr_accessor :config
+    attr_accessor :output_dir
 
     def initialize
-      @config = {}
-      @csv_output = ""
-      parse_config
+      @config = parse_config
+      @output_dir = ""
       @schema = {}
       @output_files = []
     end
 
     def tables_to_csv
       tables = config["tables"]
-      @csv_output = config["output"]["csv"]
-      unless File.directory?(@csv_output)
-        raise "No such directory #{@csv_output}"
-      end
+      @output_dir = File.join("/tmp/", SecureRandom.hex)
+      Dir.mkdir(@output_dir) unless File.directory?(@output_dir)
 
       new_tables = tables.map do |k, v|
         @schema[k] = {}
         table_to_csv(k, v["whitelist"], v["pseudo"])
       end
 
       schema_to_yml
       file_list_to_json
       new_tables
@@ -57,7 +56,7 @@ module Pseudonymity
       file_timestamp = filename || "#{prefix}_#{Time.now.to_i}"
       file_timestamp = "#{file_timestamp}.#{ext}"
       @output_files << file_timestamp
-      File.join(@csv_output, file_timestamp)
+      File.join(@output_dir, file_timestamp)
     end
 
     def schema_to_yml
@@ -103,10 +102,11 @@ module Pseudonymity
     end
 
     def parse_config
-      @config = YAML.load_file(Rails.root.join('./ee/lib/assets/pseudonymity_dump.yml'))
+      YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest))
     end
 
     def write_to_csv_file(title, contents)
+      Rails.logger.info "Writing #{title} ..."
       file_path = get_and_log_file_name("csv", title)
       column_names = contents.first.keys
       contents = CSV.generate do |csv|
...
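With this change the dump no longer needs a preconfigured output path; each run writes into a fresh random directory under /tmp. A hedged usage sketch (paths are illustrative):

  table = Pseudonymity::Table.new
  table.tables_to_csv

  # The dump location is now exposed for the upload step:
  table.output_dir  # => "/tmp/<random hex>", holding one CSV per table,
                    #    plus the schema YAML and the JSON file list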
module Pseudonymity
  class UploadService
    RemoteStorageUnavailableError = Class.new(StandardError)

    attr_reader :progress

    def initialize(output_dir, progress)
      @progress = progress
      @output_dir = output_dir
    end

    def upload
      directory = remote_directory
      progress.print "Uploading pseudonymized data to remote storage #{config.upload.remote_directory} ... "

      file_list.each do |file|
        upload_file(file, directory)
      end
    end

    def upload_file(file, directory)
      progress.print "\tUploading #{file} ... "
      if directory.files.create(key: File.basename(file), body: File.open(file), public: false)
        progress.puts "done".color(:green)
      else
        progress.puts "uploading CSV to #{config.upload.remote_directory} failed".color(:red)
      end
    end

    def cleanup
      progress.print "Deleting tmp directory #{@output_dir} ... "
      return unless File.exist?(@output_dir)

      # FileUtils.rm_rf returns its argument (always truthy), so check
      # whether the directory is actually gone to report success.
      FileUtils.rm_rf(@output_dir)
      if File.exist?(@output_dir)
        progress.puts "failed".color(:red)
      else
        progress.puts "done".color(:green)
      end
    end

    private

    def config
      Gitlab.config.pseudonymizer
    end

    def remote_directory
      connection_settings = config.upload.connection
      if connection_settings.blank?
        progress.puts "Cannot upload files, make sure `pseudonymizer.upload.connection` is set properly".color(:red)
        raise RemoteStorageUnavailableError.new(connection_settings)
      end

      connect_to_remote_directory(connection_settings)
    end

    def connect_to_remote_directory(connection_settings)
      # Our settings use string keys, but Fog expects symbols.
      connection = ::Fog::Storage.new(connection_settings.symbolize_keys)
      remote_dir = config.upload.remote_directory

      # We only attempt to create the directory for local storage. For AWS
      # and other cloud providers, we cannot guarantee the user will have
      # permission to create the bucket.
      if connection.service == ::Fog::Storage::Local
        connection.directories.create(key: remote_dir)
      else
        connection.directories.get(remote_dir)
      end
    end

    def file_list
      Dir[File.join(@output_dir, "*.{csv,yml}")]
    end
  end
end
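A hedged usage sketch, wiring the service to Fog's Local provider so it can be exercised without cloud credentials (the gitlab.yml values shown are assumptions for illustration):

  # Assuming gitlab.yml contains something like:
  #   pseudonymizer:
  #     upload:
  #       remote_directory: pseudo-dumps
  #       connection:
  #         provider: Local
  #         local_root: /tmp/fog
  service = Pseudonymity::UploadService.new("/tmp/pseudo_dump", $stdout)
  service.upload   # connects via Fog, creates the local "bucket", uploads CSVs
  service.cleanup  # removes the temporary dump directory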
@@ -78,6 +78,21 @@ namespace :gitlab do
     task pseudonymity_dump: :environment do
       table = Pseudonymity::Table.new
       table.tables_to_csv
+
+      upload = Pseudonymity::UploadService.new(table.output_dir, progress)
+      upload.upload
+      upload.cleanup
+    end
+
+    def progress
+      if ENV['CRON']
+        # We need an object that responds to `puts` and `print`;
+        # a StringIO quietly swallows the output.
+        require 'stringio'
+        StringIO.new
+      else
+        $stdout
+      end
     end
   end
 end
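Note: the dump-and-upload flow can be driven manually with this rake task (for example "bundle exec rake gitlab:pseudonymity_dump"; the exact namespace nesting may differ), while the nightly cron entry configured in 1_settings.rb runs it via GitlabEltDataDumpWorker. Setting the CRON environment variable redirects progress output into a throwaway StringIO instead of the terminal.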