prepare_untracked_uploads.rb 5.67 KB
Newer Older
Michael Kozono's avatar
Michael Kozono committed
1 2
# frozen_string_literal: true

3 4
module Gitlab
  module BackgroundMigration
Michael Kozono's avatar
Michael Kozono committed
5 6 7
    # This class finds all non-hashed uploaded file paths and saves them to an
    # `untracked_files_for_uploads` table.
    class PrepareUntrackedUploads # rubocop:disable Metrics/ClassLength
8 9
      # For bulk_queue_background_migration_jobs_by_range
      include Database::MigrationHelpers
10
      include ::Gitlab::Utils::StrongMemoize
11

Michael Kozono's avatar
Michael Kozono committed
12
      # Number of file paths collected before each bulk insert.
      FIND_BATCH_SIZE = 500
      RELATIVE_UPLOAD_DIR = "uploads".freeze
      ABSOLUTE_UPLOAD_DIR = File.join(
        Gitlab.config.uploads.storage_path,
        RELATIVE_UPLOAD_DIR
      )
      FOLLOW_UP_MIGRATION = 'PopulateUntrackedUploads'.freeze
      # Matches the storage root prefix (plus the following slash) at the start
      # of a path. The configured path is escaped so that regex metacharacters
      # in a directory name (e.g. `+`, `(`, `[`) are matched literally instead
      # of silently preventing the prefix from being stripped.
      START_WITH_ROOT_REGEX = %r{\A#{Regexp.escape(Gitlab.config.uploads.storage_path.to_s)}/}
      # `find -path` patterns for subtrees that are left out of the scan.
      EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze
      EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze
22

Michael Kozono's avatar
Michael Kozono committed
23 24
      # Minimal model mapped onto the `untracked_files_for_uploads` table.
      # `EachBatch` is mixed in so that its rows can be walked in batches.
      class UntrackedFile < ActiveRecord::Base
        include EachBatch

        self.table_name = 'untracked_files_for_uploads'
      end

      # Entry point: makes sure the tracking table exists, fills it with the
      # upload paths found on disk, then either schedules the follow-up
      # migration or, when nothing was found, drops the table again.
      def perform
        ensure_temporary_tracking_table_exists

        # Postgres < 9.5 has no ON CONFLICT DO NOTHING, and emulating
        # insert-if-not-exists without it would be slow. On those versions we
        # therefore start from an empty table, which lets us bulk insert at
        # ~30x the speed of individual inserts (~20 minutes of inserts at
        # GitLab.com scale instead of ~10 hours).
        # Every other installation gets bulk inserts *and* retryable jobs
        # without having to clear and reinsert.
        clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates?

        store_untracked_file_paths

        nothing_tracked = UntrackedFile.all.empty?
        return drop_temp_table if nothing_tracked

        schedule_populate_untracked_uploads_jobs
      end

      private

55
      # Creates the `untracked_files_for_uploads` table (a single non-null
      # `path` string column, max 600 chars, with a unique index) unless a
      # data source with that name already exists.
      def ensure_temporary_tracking_table_exists
        tracking_table = :untracked_files_for_uploads
        return if ActiveRecord::Base.connection.data_source_exists?(tracking_table)

        UntrackedFile.connection.create_table(tracking_table) do |t|
          t.string :path, limit: 600, null: false
          t.index :path, unique: true
        end
      end

Michael Kozono's avatar
Michael Kozono committed
66 67 68 69
      # Removes every row from the table behind UntrackedFile via `delete_all`.
      def clear_untracked_file_paths
        UntrackedFile.delete_all
      end

70
      # Scans ABSOLUTE_UPLOAD_DIR in batches of FIND_BATCH_SIZE paths and
      # inserts each batch. Does nothing when the directory does not exist.
      def store_untracked_file_paths
        if Dir.exist?(ABSOLUTE_UPLOAD_DIR)
          each_file_batch(ABSOLUTE_UPLOAD_DIR, FIND_BATCH_SIZE) { |batch| insert_file_paths(batch) }
        end
      end

78
      # Runs the `find` command for +search_dir+ and hands its output to the
      # given block in arrays of up to +batch_size+ paths.
      #
      # Raises a RuntimeError ("Find command failed") when the command exits
      # unsuccessfully.
      def each_file_batch(search_dir, batch_size, &block)
        find_cmd = build_find_command(search_dir)

        Open3.popen2(*find_cmd) do |_stdin, find_output, wait_thread|
          yield_paths_in_batches(find_output, batch_size, &block)

          # `value` blocks until the child process has exited.
          find_succeeded = wait_thread.value.success?
          raise "Find command failed" unless find_succeeded
        end
      end

88 89 90 91
      # Reads NUL-separated paths from +stdout+, strips the leading storage
      # root (START_WITH_ROOT_REGEX) from each one, and yields them in arrays
      # of +batch_size+ entries. A final, smaller array is yielded for any
      # leftover paths.
      def yield_paths_in_batches(stdout, batch_size, &block)
        batch = []

        stdout.each_line("\0") do |entry|
          batch << entry.chomp("\0").sub(START_WITH_ROOT_REGEX, '')
          next if batch.size < batch_size

          yield(batch)
          batch = []
        end

        yield(batch) unless batch.empty?
      end

103
      # Builds the argv array for the `find` invocation that lists regular
      # files under +search_dir+ (following symlinks), skipping the hashed and
      # tmp upload paths, with NUL-terminated output. When `ionice` is
      # available the command is prefixed with `ionice -c Idle`.
      #
      # The final command is logged at info level and returned.
      def build_find_command(search_dir)
        find_args = [
          'find', '-L', search_dir.to_s,
          '-type', 'f',
          '!', '(', '-path', EXCLUDED_HASHED_UPLOADS_PATH, '-prune', ')',
          '!', '(', '-path', EXCLUDED_TMP_UPLOADS_PATH, '-prune', ')',
          '-print0'
        ]

        ionice_path = which_ionice
        find_args.unshift(ionice_path.to_s, '-c', 'Idle') if ionice_path

        Rails.logger.info "PrepareUntrackedUploads find command: \"#{find_args.join(' ')}\""

        find_args
      end

Michael Kozono's avatar
Michael Kozono committed
119
      # Looks up `ionice` through Gitlab::Utils.which. Any StandardError
      # raised by the lookup is swallowed and reported as `false`.
      def which_ionice
        Gitlab::Utils.which('ionice')
      rescue StandardError
        # Falling back to `false` is relatively safe here, even though it
        # isn't very nice: the caller just skips the `ionice` prefix.
        false
      end

      # Bulk-inserts +file_paths+ into the tracking table with a single
      # statement built by insert_sql.
      #
      # An empty batch is skipped: it would otherwise produce an INSERT with
      # an empty VALUES list, which is not valid SQL.
      #
      # Returns the result of `connection.execute`, or nil for an empty batch.
      def insert_file_paths(file_paths)
        return if file_paths.empty?

        ActiveRecord::Base.connection.execute(insert_sql(file_paths))
      end
132

Michael Kozono's avatar
Michael Kozono committed
133 134 135 136 137 138 139 140 141 142 143 144
      # Returns the bulk INSERT statement for +file_paths+, in the dialect of
      # the current database:
      #
      # - PostgreSQL < 9.5: plain INSERT
      # - other PostgreSQL: INSERT ... ON CONFLICT DO NOTHING
      # - otherwise (MySQL): INSERT IGNORE
      def insert_sql(file_paths)
        target_and_values = table_columns_and_values_for_insert(file_paths)

        return "INSERT INTO #{target_and_values};" if postgresql_pre_9_5?
        return "INSERT INTO #{target_and_values} ON CONFLICT DO NOTHING;" if postgresql?

        # MySQL
        "INSERT IGNORE INTO #{target_and_values};"
      end

Michael Kozono's avatar
Michael Kozono committed
145 146
      # Returns the "<table> (path) VALUES (...), (...)" fragment shared by all
      # INSERT variants. Each path is quoted through ActiveRecord's
      # `sanitize_sql_array` before being placed into the statement.
      def table_columns_and_values_for_insert(file_paths)
        value_tuples = file_paths.map do |file_path|
          ActiveRecord::Base.send(:sanitize_sql_array, ['(?)', file_path]) # rubocop:disable GitlabSecurity/PublicSend
        end

        "#{UntrackedFile.table_name} (path) VALUES #{value_tuples.join(', ')}"
      end

153
      # Memoized result of Gitlab::Database.postgresql?.
      def postgresql?
        strong_memoize(:postgresql) { Gitlab::Database.postgresql? }
      end

Michael Kozono's avatar
Michael Kozono committed
159 160 161 162
      # True for every database except PostgreSQL older than 9.5, i.e. the
      # exact negation of postgresql_pre_9_5?.
      def can_bulk_insert_and_ignore_duplicates?
        !postgresql_pre_9_5?
      end

163
      # Memoized check: true when the database is PostgreSQL and its version,
      # read as a float, is below 9.5.
      def postgresql_pre_9_5?
        strong_memoize(:postgresql_pre_9_5) { postgresql? && Gitlab::Database.version.to_f < 9.5 }
      end

169
      # Queues FOLLOW_UP_MIGRATION jobs over ranges of UntrackedFile rows,
      # using the helper mixed in from Database::MigrationHelpers.
      def schedule_populate_untracked_uploads_jobs
        bulk_queue_background_migration_jobs_by_range(UntrackedFile, FOLLOW_UP_MIGRATION)
      end
173 174

      # Drops the `untracked_files_for_uploads` table if it exists.
      # Skipped in the test environment.
      def drop_temp_table
        # Dropping a table intermittently breaks test cleanup
        return if Rails.env.test?

        UntrackedFile.connection.drop_table(:untracked_files_for_uploads, if_exists: true)
      end
180 181 182
    end
  end
end