Commit 5a01d88c authored by Michael Kozono's avatar Michael Kozono

Add local project uploads cleanup task

parent 640b5db0
---
title: Add local project uploads cleanup task
merge_request: 20863
author:
type: added
......@@ -31,4 +31,35 @@ sudo gitlab-rake gitlab:cleanup:moved
# installation from source
bundle exec rake gitlab:cleanup:moved RAILS_ENV=production
```
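Clean up local project upload files that do not exist in the GitLab database. The
task attempts to fix the file (move it to its correct location) if it can find its
project; otherwise it moves the file to a lost and found directory. By default the
task only reports what it would do (dry run); run it with `DRY_RUN=false` to
actually move the files.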
```
# omnibus-gitlab
sudo gitlab-rake gitlab:cleanup:project_uploads
# installation from source
bundle exec rake gitlab:cleanup:project_uploads RAILS_ENV=production
```
Example output:
```
$ sudo gitlab-rake gitlab:cleanup:project_uploads
I, [2018-07-27T12:08:27.671559 #89817] INFO -- : Looking for orphaned project uploads to clean up. Dry run...
D, [2018-07-27T12:08:28.293568 #89817] DEBUG -- : Processing batch of 500 project upload file paths, starting with /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out
I, [2018-07-27T12:08:28.689869 #89817] INFO -- : Can move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/test.out
I, [2018-07-27T12:08:28.755624 #89817] INFO -- : Can fix /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/qux/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt
I, [2018-07-27T12:08:28.760257 #89817] INFO -- : Can move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png
I, [2018-07-27T12:08:28.764470 #89817] INFO -- : To clean up these files run this command with DRY_RUN=false
$ sudo gitlab-rake gitlab:cleanup:project_uploads DRY_RUN=false
I, [2018-07-27T12:08:32.944414 #89936] INFO -- : Looking for orphaned project uploads to clean up...
D, [2018-07-27T12:08:33.293568 #89817] DEBUG -- : Processing batch of 500 project upload file paths, starting with /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out
I, [2018-07-27T12:08:33.689869 #89817] INFO -- : Did move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/test.out
I, [2018-07-27T12:08:33.755624 #89817] INFO -- : Did fix /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/qux/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt
I, [2018-07-27T12:08:33.760257 #89817] INFO -- : Did move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png
```
# frozen_string_literal: true
module Gitlab
module Cleanup
class ProjectUploadFileFinder
FIND_BATCH_SIZE = 500
ABSOLUTE_UPLOAD_DIR = FileUploader.root.freeze
EXCLUDED_SYSTEM_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/-/*".freeze
EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze
EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze
# Runs `find` over the upload directory and yields the file paths it prints
# to the given block, in arrays of at most `batch_size` paths.
#
# The yielded paths begin with ABSOLUTE_UPLOAD_DIR (they are not relative),
# because that directory is the starting point handed to `find`.
#
# batch_size - maximum number of paths per yielded array
#              (defaults to FIND_BATCH_SIZE)
#
# Raises RuntimeError ("Find command failed") when the command does not
# exit successfully.
def each_file_batch(batch_size: FIND_BATCH_SIZE, &block)
  find_command = build_find_command(ABSOLUTE_UPLOAD_DIR)

  Open3.popen2(*find_command) do |_stdin, find_output, wait_thread|
    yield_paths_in_batches(find_output, batch_size, &block)

    # The exit status is only checked after all output has been consumed.
    raise "Find command failed" unless wait_thread.value.success?
  end
end
private
# Reads NUL-terminated paths from `stdout` and yields them in arrays.
#
# stdout     - IO-like object responding to `each_line`
# batch_size - an array is yielded as soon as it holds this many paths
#
# A final, smaller array is yielded for any leftover paths; nothing is
# yielded when there is no input.
def yield_paths_in_batches(stdout, batch_size, &block)
  batch = []

  stdout.each_line("\0") do |entry|
    batch.push(entry.chomp("\0"))
    next if batch.size < batch_size

    yield(batch)
    batch = []
  end

  yield(batch) unless batch.empty?
end
# Builds the argv array for the `find` invocation that lists upload files.
#
# search_dir - directory handed to `find` as its starting point
#
# The command follows symlinks (-L), is restricted to regular files,
# excludes paths matching the system, hashed and tmp upload patterns, and
# prints NUL-terminated paths (-print0). When `which_ionice` returns a
# path, the command is prefixed with "<ionice> -c Idle". The resulting
# command line is written to the Rails log at info level.
#
# Returns an Array of Strings.
def build_find_command(search_dir)
  exclusions = [
    EXCLUDED_SYSTEM_UPLOADS_PATH,
    EXCLUDED_HASHED_UPLOADS_PATH,
    EXCLUDED_TMP_UPLOADS_PATH
  ].flat_map { |pattern| ['!', '(', '-path', pattern.to_s, '-prune', ')'] }

  find_args = ['find', '-L', search_dir.to_s, '-type', 'f', *exclusions, '-print0']

  ionice_path = which_ionice
  find_args = [ionice_path.to_s, '-c', 'Idle'] + find_args if ionice_path

  Rails.logger.info %(find command: "#{find_args.join(' ')}")

  find_args
end
# Looks up the `ionice` executable through Gitlab::Utils.which.
#
# Returns whatever the lookup returns, or false when the lookup raises a
# StandardError.
def which_ionice
  Gitlab::Utils.which('ionice')
rescue
  # Running `find` without ionice is not nice to the disk, but it is a
  # relatively safe fallback, so a failed lookup is treated as "not found".
  false
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Cleanup
class ProjectUploads
LOST_AND_FOUND = File.join(ProjectUploadFileFinder::ABSOLUTE_UPLOAD_DIR, '-', 'project-lost-found')
attr_reader :logger
# logger - object that receives all output of the cleanup; Rails.logger is
#          used when none (or nil) is given.
def initialize(logger: nil)
  @logger = logger
  @logger ||= Rails.logger
end
# Processes every orphaned project upload file and logs one info line per
# file with the outcome reported by #cleanup.
#
# dry_run - passed through to #cleanup; defaults to true, in which case
#           the opening log line is marked with "Dry run".
def run!(dry_run: true)
  dry_run_note = dry_run ? '. Dry run' : nil
  logger.info "Looking for orphaned project uploads to clean up#{dry_run_note}..."

  each_orphan_file do |path, upload_path|
    logger.info cleanup(path, upload_path, dry_run)
  end
end
private
# Decides what to do with a single orphaned file and does it.
#
# path        - absolute path of the file on disk
# upload_path - "#{hex_secret}/#{filename}" part of the path, or nil when
#               the path could not be parsed
# dry_run     - passed through to the move helpers
#
# Returns a String describing the outcome.
def cleanup(path, upload_path, dry_run)
  # This happened in staging: `find` returned a path on which
  # `File.delete` raised `Errno::ENOENT`, so re-check that the file exists.
  return "Cannot find file: #{path}" unless File.exist?(path)

  destination = find_correct_path(upload_path) if upload_path
  return move_to_lost_and_found(path, dry_run) unless destination

  move(path, destination, 'fix', dry_run)
end
# Looks up where an upload file is supposed to live.
#
# upload_path - path in the form of "#{hex_secret}/#{filename}"
#
# Returns the absolute path of the matching local FileUploader upload, or
# nil when there is no such upload, it is not local, or the lookup raised.
def find_correct_path(upload_path)
  upload = Upload.find_by(uploader: 'FileUploader', path: upload_path)

  upload.absolute_path if upload && upload.local?
rescue => e
  logger.error e.message

  # absolute_path depends on a lot of code. If it doesn't work, then it
  # doesn't matter whether the upload file is in the right place: treat it
  # as uncorrectable. E.g. the project record might be missing, which
  # raises an exception.
  nil
end
# Moves (or, on a dry run, reports the move of) a file into the
# lost-and-found directory, keeping its path below the upload root.
#
# path    - absolute path of the orphaned file
# dry_run - passed through to #move
#
# Returns the message String produced by #move.
def move_to_lost_and_found(path, dry_run)
  upload_root = ProjectUploadFileFinder::ABSOLUTE_UPLOAD_DIR.to_s

  # Escape the root so regex metacharacters in the directory name (".",
  # "+", ...) match literally, and use the block form of `sub` so the
  # replacement is inserted verbatim instead of being scanned for
  # back-references.
  new_path = path.sub(/\A#{Regexp.escape(upload_root)}/) { LOST_AND_FOUND }

  move(path, new_path, 'move to lost and found', dry_run)
end
# Moves a file, creating the destination directory first, or only
# describes the move on a dry run.
#
# path     - current absolute path of the file
# new_path - absolute path the file should be moved to
# prefix   - verb phrase used in the returned message (e.g. 'fix')
# dry_run  - when truthy nothing on disk is touched
#
# Returns a String starting with "Can" (dry run), "Did" (moved) or
# "Error during" (the move raised; the exception is included).
def move(path, new_path, prefix, dry_run)
  description = "#{prefix} #{path} -> #{new_path}"
  return "Can #{description}" if dry_run

  begin
    FileUtils.mkdir_p(File.dirname(new_path))
    FileUtils.mv(path, new_path)

    "Did #{description}"
  rescue => e
    "Error during #{description}: #{e.inspect}"
  end
end
# Walks all project upload files on disk, batch by batch, and yields the
# ones that are not in the uploads table.
#
# Yields the absolute file path and the parsed upload path (which is nil
# when the file path could not be parsed). Each batch is announced with a
# debug log line.
def each_orphan_file
  ProjectUploadFileFinder.new.each_file_batch do |batch|
    logger.debug "Processing batch of #{batch.size} project upload file paths, starting with #{batch.first}"

    batch.each do |file_path|
      candidate = ProjectUploadPath.from_path(file_path)
      next unless candidate.orphan?

      yield(file_path, candidate.upload_path)
    end
  end
end
# Splits the absolute path of a file below the upload root into the
# project's full path and the "#{hex_secret}/#{filename}" upload path, and
# answers whether a matching upload record exists.
class ProjectUploadPath
  # Captures (1) the project full path and (2) the upload path.
  # The upload root is passed through Regexp.escape so that regex
  # metacharacters in the directory name (".", "+", ...) match literally.
  PROJECT_FULL_PATH_REGEX = %r{\A#{Regexp.escape(FileUploader.root.to_s)}/(.+)/(\h+/[^/]+)\z}.freeze

  attr_reader :full_path, :upload_path

  # full_path   - full path of the project the file appears to belong to
  # upload_path - "#{hex_secret}/#{filename}" part of the file path
  # Both are nil when the file path could not be parsed.
  def initialize(full_path, upload_path)
    @full_path = full_path
    @upload_path = upload_path
  end

  # Builds an instance from an absolute file path. When the path does not
  # match PROJECT_FULL_PATH_REGEX both attributes are nil.
  def self.from_path(path)
    path_matched = path.match(PROJECT_FULL_PATH_REGEX)
    return new(nil, nil) unless path_matched

    new(path_matched[1], path_matched[2])
  end

  # Returns true when the path could not be parsed, or when no FileUploader
  # upload with this upload path exists for the project found at full_path.
  def orphan?
    return true if full_path.nil? || upload_path.nil?

    # It's possible to reduce to one query, but `where_full_path_in` is complex
    !Upload.exists?(path: upload_path, model_id: project_id, model_type: 'Project', uploader: 'FileUploader')
  end

  private

  # NOTE(review): `pluck(:id)` presumably returns an Array of ids rather
  # than a single id, so `model_id:` above receives a list that may be
  # empty when no project matches — confirm this is intended.
  def project_id
    @project_id ||= Project.where_full_path_in([full_path]).pluck(:id)
  end
end
end
end
end
......@@ -7,9 +7,8 @@ namespace :gitlab do
desc "GitLab | Cleanup | Clean namespaces"
task dirs: :gitlab_environment do
warn_user_is_not_gitlab
remove_flag = ENV['REMOVE']
namespaces = Namespace.pluck(:path)
namespaces = Namespace.pluck(:path)
namespaces << HASHED_REPOSITORY_NAME # add so that it will be ignored
Gitlab.config.repositories.storages.each do |name, repository_storage|
git_base_path = Gitlab::GitalyClient::StorageSettings.allow_disk_access { repository_storage.legacy_disk_path }
......@@ -31,8 +30,8 @@ namespace :gitlab do
end
all_dirs.each do |dir_path|
if remove_flag
if FileUtils.rm_rf(dir_path)
if remove?
if FileUtils.rm_rf dir_path
puts "Removed...#{dir_path}".color(:red)
else
puts "Cannot remove #{dir_path}".color(:red)
......@@ -43,7 +42,7 @@ namespace :gitlab do
end
end
unless remove_flag
unless remove?
puts "To cleanup this directories run this command with REMOVE=true".color(:yellow)
end
end
......@@ -134,5 +133,37 @@ namespace :gitlab do
puts "To block these users run this command with BLOCK=true".color(:yellow)
end
end
desc "GitLab | Cleanup | Clean orphaned project uploads"
task project_uploads: :gitlab_environment do
  warn_user_is_not_gitlab

  # Runs as a dry run unless DRY_RUN=false is set in the environment.
  Gitlab::Cleanup::ProjectUploads.new(logger: logger).run!(dry_run: dry_run?)

  logger.info "To clean up these files run this command with DRY_RUN=false".color(:yellow) if dry_run?
end
# True only when the REMOVE environment variable is exactly "true".
def remove?
  ENV.fetch('REMOVE', nil) == 'true'
end
# Dry run is the default: it is only switched off when the DRY_RUN
# environment variable is exactly "false".
def dry_run?
  ENV.fetch('DRY_RUN', nil) != 'false'
end
# Memoized logger for the cleanup tasks.
#
# In the development and production environments this is a STDOUT logger
# extended with ActiveSupport::Logger.broadcast(Rails.logger); in any other
# environment it is Rails.logger itself.
def logger
  return @logger if defined?(@logger)

  log_to_stdout = Rails.env.development? || Rails.env.production?

  @logger =
    if log_to_stdout
      Logger.new(STDOUT).tap do |stdout_logger|
        stdout_logger.extend(ActiveSupport::Logger.broadcast(Rails.logger))
      end
    else
      Rails.logger
    end
end
end
end
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment