module Pseudonymizer class Dumper attr_accessor :config, :output_dir def initialize(options) @config = options.config.deep_symbolize_keys @output_dir = options.output_dir @start_at = options.start_at reset! end def reset! @schema = Hash.new { |h, k| h[k] = {} } @output_files = [] end def tables_to_csv return @output_files unless @output_files.empty? tables = config[:tables] FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir) schema_to_yml @output_files = tables.map do |k, v| table_to_csv(k, v[:whitelist], v[:pseudo]) end.compact file_list_to_json @output_files end private def output_filename(basename = nil, ext = "csv.gz") File.join(output_dir, "#{basename}.#{ext}") end def schema_to_yml file_path = output_filename("schema", "yml") File.write(file_path, @schema.to_yaml) end def file_list_to_json file_path = output_filename("file_list", "json") relative_files = @output_files.map(&File.method(:basename)) File.write(file_path, relative_files.to_json) end def table_to_csv(table, whitelist_columns, pseudonymity_columns) table_to_schema(table) write_to_csv_file( table, table_page_results(table, whitelist_columns, pseudonymity_columns) ) rescue => e Rails.logger.error("Failed to export #{table}: #{e}") raise e end # yield every results, pagined, anonymized def table_page_results(table, whitelist_columns, pseudonymity_columns) filter = Filter.new(table, whitelist_columns, pseudonymity_columns) pager = Pager.new(table, whitelist_columns) Enumerator.new do |yielder| pager.pages do |page| filter.anonymize(page).each do |result| yielder << result end end end.lazy end def table_to_schema(table) table_config = @config.dig(:tables, table) type_results = ActiveRecord::Base.connection.columns(table) type_results = type_results.select do |c| table_config[:whitelist].include?(c.name) end type_results = type_results.map do |c| data_type = c.sql_type if table_config[:pseudo].include?(c.name) data_type = "character varying" end { name: c.name, data_type: data_type } end set_schema_column_types(table, type_results) end def set_schema_column_types(table, type_results) type_results.each do |type_result| @schema[table][type_result[:name]] = type_result[:data_type] end # hard coded because all mapping keys in GL are id @schema[table]["gl_mapping_key"] = "id" end def write_to_csv_file(table, contents) file_path = output_filename(table) headers = contents.peek.keys Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}." Zlib::GzipWriter.open(file_path) do |io| csv = CSV.new(io, headers: headers, write_headers: true) contents.each { |row| csv << row.values } end file_path rescue StopIteration Rails.logger.info "#{self.class.name} table #{table} is empty." nil end end end