Commit 4eba0293 authored by Bob Van Landuyt

Merge branch '36628-create-a-rake-task-to-cleanup-unused-lfs-files' into 'master'

Create a rake task to cleanup unused LFS files

Closes #36628

See merge request gitlab-org/gitlab!21747
parents 6327fdde 9a52babc
# frozen_string_literal: true
class LfsObjectsProject < ApplicationRecord
include ::EachBatch
belongs_to :project
belongs_to :lfs_object
......
---
title: Create a rake task to cleanup unused LFS files
merge_request: 21747
author:
type: added
# Cleanup
## Remove unreferenced LFS files from filesystem
DANGER: **Danger:**
Do not run this within 12 hours of a GitLab upgrade. This ensures that all background migrations have finished; running the task before they complete may lead to data loss.
When you remove LFS files from a repository's history, they become orphaned and continue to consume disk space. With this rake task, you can remove invalid references from the database, which
will allow garbage collection of LFS files.
For example:
```shell
# omnibus-gitlab
sudo gitlab-rake gitlab:cleanup:orphan_lfs_file_references PROJECT_PATH="gitlab-org/gitlab-foss"
# installation from source
bundle exec rake gitlab:cleanup:orphan_lfs_file_references RAILS_ENV=production PROJECT_PATH="gitlab-org/gitlab-foss"
```
You can also specify the project with `PROJECT_ID` instead of `PROJECT_PATH`.
Example output:
```shell
$ sudo gitlab-rake gitlab:cleanup:orphan_lfs_file_references PROJECT_PATH="gitlab-org/gitlab-foss"
I, [2019-12-13T16:35:31.764962 #82356] INFO -- : Looking for orphan LFS files for project GitLab Org / GitLab Foss
I, [2019-12-13T16:35:31.923659 #82356] INFO -- : Removed invalid references: 12
```
By default, this task does not delete anything but shows how many file references it can
delete. Run the command with `DRY_RUN=false` if you actually want to
delete the references. You can also use the `LIMIT={number}` parameter to set how many references are deleted per batch (the default is 1000).
Note that this rake task only removes the references to LFS files. Unreferenced LFS files will be garbage-collected
later (once a day). If you need to garbage collect them immediately, run
`rake gitlab:cleanup:orphan_lfs_files` described below.
## Remove unreferenced LFS files
Unreferenced LFS files are removed on a daily basis but you can remove them immediately if
you need to. For example:
```shell
# omnibus-gitlab
sudo gitlab-rake gitlab:cleanup:orphan_lfs_files
# installation from source
bundle exec rake gitlab:cleanup:orphan_lfs_files
```
Example output:
```shell
$ sudo gitlab-rake gitlab:cleanup:orphan_lfs_files
I, [2020-01-08T20:51:17.148765 #43765] INFO -- : Removed unreferenced LFS files: 12
```
## Remove garbage from filesystem
Clean up local project upload files if they don't exist in GitLab database. The
......
# frozen_string_literal: true
require 'spec_helper'
describe Gitlab::Cleanup::OrphanLfsFileReferences do
  let(:null_logger) { Logger.new('/dev/null') }
  let(:project) { create(:project, :repository, lfs_enabled: true) }
  let(:lfs_object) { create(:lfs_object) }

  # Reference the examples below expect the cleaner to report as invalid.
  let!(:invalid_reference) { create(:lfs_objects_project, project: project, lfs_object: lfs_object) }

  before do
    allow(null_logger).to receive(:info)
    allow(Gitlab.config.lfs).to receive(:enabled).and_return(true)

    # Valid reference: an LFS object that reuses the OID of the first LFS
    # pointer reported for the project's repository.
    first_pointer = project.repository.gitaly_blob_client.get_all_lfs_pointers(nil).first
    valid_lfs_object = create(:lfs_object, oid: first_pointer.lfs_oid)
    create(:lfs_objects_project, project: project, lfs_object: valid_lfs_object)
  end

  context 'dry run' do
    it 'prints messages and does not delete references' do
      [
        "[DRY RUN] Looking for orphan LFS files for project #{project.name_with_namespace}",
        "[DRY RUN] Found invalid references: 1"
      ].each do |message|
        expect(null_logger).to receive(:info).with(message)
      end

      cleaner = described_class.new(project, logger: null_logger)

      expect { cleaner.run! }.not_to change { project.lfs_objects.count }
    end
  end

  context 'regular run' do
    it 'prints messages and deletes invalid reference' do
      [
        "Looking for orphan LFS files for project #{project.name_with_namespace}",
        "Removed invalid references: 1"
      ].each do |message|
        expect(null_logger).to receive(:info).with(message)
      end

      cleaner = described_class.new(project, logger: null_logger, dry_run: false)

      expect { cleaner.run! }.to change { project.lfs_objects.count }.from(2).to(1)
      expect(LfsObjectsProject.exists?(invalid_reference.id)).to be_falsey
    end
  end
end
# frozen_string_literal: true
module Gitlab
  module Cleanup
    # Finds the LFS object references of a project (rows reachable through
    # `project.lfs_objects_projects`) whose LFS object OID is stored in the
    # database but is not among the OIDs of the LFS pointers reported for the
    # project's repository, and optionally deletes those references.
    #
    # Only the references are deleted here; the LFS objects themselves are not
    # touched by this class.
    class OrphanLfsFileReferences
      include Gitlab::Utils::StrongMemoize

      attr_reader :project, :dry_run, :logger, :limit

      # Batch size used for deletions when no `limit` is given.
      DEFAULT_REMOVAL_LIMIT = 1000

      # @param project the project whose LFS references are inspected
      # @param dry_run [Boolean] when true (the default) nothing is deleted and
      #   only the number of invalid references is logged
      # @param logger the logger receiving progress messages; falls back to
      #   `Rails.logger` when nil
      # @param limit [Integer, nil] number of references deleted per batch;
      #   falls back to DEFAULT_REMOVAL_LIMIT when nil. It is passed to
      #   `each_batch(of:)`, so it does not cap the total number of deletions.
      def initialize(project, dry_run: true, logger: nil, limit: nil)
        @project = project
        @dry_run = dry_run
        @logger = logger || Rails.logger # rubocop:disable Gitlab/RailsLogger
        @limit = limit
      end

      # Logs which project is being processed, then reports (dry run) or
      # deletes (regular run) the invalid references.
      def run!
        log_info("Looking for orphan LFS files for project #{project.name_with_namespace}")

        remove_orphan_references
      end

      private

      # Counts the invalid references in a dry run; otherwise deletes them
      # batch by batch and logs how many rows were removed.
      def remove_orphan_references
        invalid_references = project.lfs_objects_projects.where(lfs_object: orphan_objects) # rubocop:disable CodeReuse/ActiveRecord

        if dry_run
          log_info("Found invalid references: #{invalid_references.count}")
        else
          batch_size = limit || DEFAULT_REMOVAL_LIMIT
          count = 0

          invalid_references.each_batch(of: batch_size) do |relation|
            count += relation.delete_all
          end

          log_info("Removed invalid references: #{count}")
        end
      end

      # OIDs of the LFS pointers returned by the repository's Gitaly blob client.
      def lfs_oids_from_repository
        project.repository.gitaly_blob_client.get_all_lfs_pointers(nil).map(&:lfs_oid)
      end

      # OIDs known to the database for this project but absent from the
      # repository's LFS pointers.
      def orphan_oids
        lfs_oids_from_database - lfs_oids_from_repository
      end

      # Collects the OIDs of all LFS objects linked to the project, reading
      # them in batches.
      def lfs_oids_from_database
        oids = []

        project.lfs_objects.each_batch do |relation|
          # Append in place: `+=` would copy the whole accumulated array on
          # every batch.
          oids.concat(relation.pluck(:oid)) # rubocop:disable CodeReuse/ActiveRecord
        end

        oids
      end

      # LFS objects whose OID is considered orphaned for this project.
      def orphan_objects
        LfsObject.where(oid: orphan_oids) # rubocop:disable CodeReuse/ActiveRecord
      end

      # Logs +msg+ at info level, prefixed with "[DRY RUN] " during a dry run.
      def log_info(msg)
        logger.info("#{'[DRY RUN] ' if dry_run}#{msg}")
      end
    end
  end
end
......@@ -64,6 +64,40 @@ namespace :gitlab do
end
end
# Reports (or, with DRY_RUN=false, deletes) the orphan LFS file references of
# the project selected through PROJECT_ID or PROJECT_PATH.
desc 'GitLab | Cleanup | Clean orphan LFS file references'
task orphan_lfs_file_references: :gitlab_environment do
  warn_user_is_not_gitlab

  project = find_project

  # Without a project there is nothing to clean: tell the user how to pick one and stop.
  unless project
    usage = "Specify the project with PROJECT_ID={number} or PROJECT_PATH={namespace/project-name}"
    logger.info usage.color(:red)
    exit
  end

  Gitlab::Cleanup::OrphanLfsFileReferences
    .new(project, dry_run: dry_run?, logger: logger, limit: limit)
    .run!

  logger.info "To clean up these files run this command with DRY_RUN=false".color(:yellow) if dry_run?
end
# Runs RemoveUnreferencedLfsObjectsWorker inline and logs how many files it removed.
desc 'GitLab | Cleanup | Clean orphan LFS files'
task orphan_lfs_files: :gitlab_environment do
  warn_user_is_not_gitlab

  worker = RemoveUnreferencedLfsObjectsWorker.new
  removed_files = worker.perform

  summary = "Removed unreferenced LFS files: #{removed_files.count}"
  logger.info summary.color(:green)
end
namespace :sessions do
desc "GitLab | Cleanup | Sessions | Clean ActiveSession lookup keys"
task active_sessions_lookup_keys: :gitlab_environment do
......@@ -136,6 +170,14 @@ namespace :gitlab do
ENV['NICENESS'].presence
end
# Looks up the project selected through the environment.
#
# PROJECT_ID takes precedence over PROJECT_PATH. Returns nil when neither
# variable is set (or when the lookup itself finds nothing).
def find_project
  project_id = ENV['PROJECT_ID']
  return Project.find_by_id(project_id.to_i) if project_id

  project_path = ENV['PROJECT_PATH']
  Project.find_by_full_path(project_path) if project_path
end
# rubocop:disable Gitlab/RailsLogger
def logger
return @logger if defined?(@logger)
......
......@@ -120,6 +120,71 @@ describe 'gitlab:cleanup rake tasks' do
end
end
describe 'gitlab:cleanup:orphan_lfs_file_references' do
  subject(:rake_task) { run_rake_task('gitlab:cleanup:orphan_lfs_file_references') }

  let(:project) { create(:project, :repository) }
  let(:cleaner_class) { Gitlab::Cleanup::OrphanLfsFileReferences }

  # Every example selects the project through PROJECT_ID.
  before do
    stub_env('PROJECT_ID', project.id)
  end

  it 'runs the task without errors' do
    expect(cleaner_class).to receive(:new).and_call_original

    expect { rake_task }.not_to raise_error
  end

  context 'with DRY_RUN set to false' do
    before do
      stub_env('DRY_RUN', 'false')
    end

    it 'passes dry_run correctly' do
      expected_options = { limit: anything, dry_run: false, logger: anything }

      expect(cleaner_class)
        .to receive(:new).with(project, **expected_options).and_call_original

      rake_task
    end
  end

  context 'with LIMIT set to 100' do
    before do
      stub_env('LIMIT', '100')
    end

    it 'passes limit as integer' do
      expected_options = { limit: 100, dry_run: true, logger: anything }

      expect(cleaner_class)
        .to receive(:new).with(project, **expected_options).and_call_original

      rake_task
    end
  end
end
describe 'gitlab:cleanup:orphan_lfs_files' do
  subject(:rake_task) { run_rake_task('gitlab:cleanup:orphan_lfs_files') }

  # The task is expected to call the worker's #perform directly.
  it 'runs RemoveUnreferencedLfsObjectsWorker' do
    worker_class = RemoveUnreferencedLfsObjectsWorker

    expect_any_instance_of(worker_class).to receive(:perform).and_call_original

    rake_task
  end
end
context 'sessions' do
describe 'gitlab:cleanup:sessions:active_sessions_lookup_keys', :clean_gitlab_redis_shared_state do
subject(:rake_task) { run_rake_task('gitlab:cleanup:sessions:active_sessions_lookup_keys') }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment