require 'digest'
require 'csv'
require 'yaml'

module Pseudonymity
  # Pseudonymizes selected columns of a query result by replacing their
  # values with SHA-256 hex digests.
  class Anon
    # @param fields [Array<String>, nil] names of the columns to pseudonymize;
    #   nil is treated as "no columns".
    def initialize(fields)
      @anon_fields = fields
    end

    # Builds a lazy stream of pseudonymized rows.
    #
    # @param results [#columns, #each] a result set exposing its column names
    #   via +columns+ and yielding one hash per row from +each+.
    # @return [Enumerator] yields a copy of every row in which each requested
    #   column that exists in +results.columns+ and is not nil has been
    #   replaced by the SHA-256 hex digest of its string form.
    def anonymize(results)
      columns = results.columns # all rows come from the same table
      # Array() guards against a nil field list; `nil & columns` would return false.
      to_filter = Array(@anon_fields) & columns

      Enumerator.new do |yielder|
        results.each do |result|
          # Work on a copy: the enumerator may be run more than once (e.g.
          # `first` followed by `each`), and hashing the source row in place
          # would hash already-hashed values on the second pass.
          row = result.dup
          to_filter.each do |field|
            value = row[field]
            # to_s lets non-String values (integers, timestamps) be digested.
            row[field] = Digest::SHA256.hexdigest(value.to_s) unless value.nil?
          end
          yielder << row
        end
      end
    end
  end

  # Dumps the tables listed in the pseudonymity config to CSV files, with the
  # configured columns pseudonymized, and writes a YAML file describing the
  # column types of every exported table.
  class Table
    # Config file read when no explicit path is passed to the constructor.
    DEFAULT_CONFIG_PATH = './lib/assets/pseudonymity_dump.yml'.freeze

    attr_accessor :config

    # @param config_path [String] path of the YAML config to load.
    def initialize(config_path = DEFAULT_CONFIG_PATH)
      @config_path = config_path
      @config = {}
      @csv_output = ""
      @schema = {}
      parse_config
    end

    # Exports every table under the config's "tables" key into the directory
    # named by config["output"]["csv"], then writes the schema file.
    #
    # Prints a message and returns nil when the output directory is missing.
    def tables_to_csv
      tables = config["tables"]
      @csv_output = config["output"]["csv"]
      unless File.directory?(@csv_output)
        puts "No such directory #{@csv_output}"
        return
      end

      tables.each do |table, options|
        @schema[table] = {}
        table_to_csv(table, options["whitelist"], options["pseudo"])
      end
      schema_to_yml
    end

    # Writes the collected schema as YAML to schema_<unix time>.yml inside the
    # CSV output directory.
    def schema_to_yml
      file_path = "#{@csv_output}/schema_#{Time.now.to_i}.yml"
      File.write(file_path, @schema.to_yaml)
    end

    # Exports a single table: records its column types in the schema, then
    # writes the whitelisted columns to a CSV file with the pseudonymity
    # columns hashed. Returns nil without writing a file when the table has
    # no rows.
    #
    # @param table [String] table name
    # @param whitelist_columns [Array<String>] columns to select
    # @param pseudonymity_columns [Array<String>, nil] columns to pseudonymize
    def table_to_csv(table, whitelist_columns, pseudonymity_columns)
      # NOTE(review): table and column names are interpolated into the SQL
      # unquoted; this is only safe while they come from the trusted config file.
      sql = "SELECT #{whitelist_columns.join(",")} FROM #{table};"
      type_sql = "SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '#{table}';"
      results = ActiveRecord::Base.connection.exec_query(sql)
      type_results = ActiveRecord::Base.connection.exec_query(type_sql)
      set_schema_column_types(table, type_results)
      return if results.empty?

      anon = Anon.new(pseudonymity_columns)
      write_to_csv_file(table, anon.anonymize(results))
    end

    # Stores the data type of each column of +table+ in the schema.
    #
    # @param table [String] table name
    # @param type_results [#each] rows with "column_name" and "data_type" keys
    def set_schema_column_types(table, type_results)
      # Create the entry on demand so this also works when the table was not
      # registered by tables_to_csv beforehand.
      @schema[table] ||= {}
      type_results.each do |type_result|
        @schema[table][type_result["column_name"]] = type_result["data_type"]
      end
      # hard coded because all mapping keys in GL are id
      @schema[table]["gl_mapping_key"] = "id"
    end

    # Loads the YAML config into @config.
    def parse_config
      @config = YAML.load_file(@config_path || DEFAULT_CONFIG_PATH)
    end

    private

    # Writes +contents+ to <title>_<unix time>.csv inside the CSV output
    # directory and returns the file path.
    #
    # The rows are consumed in a single pass: the header is taken from the
    # first row's keys while streaming, so a lazy enumerator is only run once
    # and the whole CSV is never held in memory.
    #
    # @param title [String] file name prefix (the table name)
    # @param contents [Enumerable<Hash>] rows to write
    # @return [String] path of the written file
    def write_to_csv_file(title, contents)
      file_path = "#{@csv_output}/#{title}_#{Time.now.to_i}.csv"
      CSV.open(file_path, 'w') do |csv|
        contents.each_with_index do |row, index|
          csv << row.keys if index.zero?
          csv << row.values
        end
      end
      file_path
    end
  end
end