Avoid rescheduling the same project again in a backfill condition

parent ec67a370
......@@ -39,7 +39,11 @@ module Geo
def schedule_job(project_id)
job_id = Geo::ProjectSyncWorker.perform_async(project_id, Time.now)
{ id: project_id, job_id: job_id } if job_id
{ project_id: project_id, job_id: job_id } if job_id
end
def scheduled_project_ids
scheduled_jobs.map { |data| data[:project_id] }
end
def finder
......@@ -59,12 +63,14 @@ module Geo
def find_project_ids_not_synced(batch_size:)
shard_restriction(finder.find_unsynced_projects(batch_size: batch_size))
.where.not(id: scheduled_project_ids)
.reorder(last_repository_updated_at: :desc)
.pluck(:id)
end
def find_project_ids_updated_recently(batch_size:)
shard_restriction(finder.find_projects_updated_recently(batch_size: batch_size))
.where.not(id: scheduled_project_ids)
.order('project_registry.last_repository_synced_at ASC NULLS FIRST, projects.last_repository_updated_at ASC')
.pluck(:id)
end
......
......@@ -145,7 +145,7 @@ module Geo
status = Gitlab::SidekiqStatus.job_status(scheduled_job_ids)
# SidekiqStatus returns an array of booleans: true if the job is still running, false otherwise.
# For each entry, first use `zip` to make { job_id: 123, id: 10 } -> [ { job_id: 123, id: 10 }, bool ]
# For each entry, first use `zip` to make { job_id: 123 } -> [ { job_id: 123 }, bool ]
# Next, filter out the jobs that have completed.
@scheduled_jobs = @scheduled_jobs.zip(status).map { |(job, running)| job if running }.compact
end
......@@ -160,12 +160,7 @@ module Geo
num_to_schedule = 0 if num_to_schedule < 0
to_schedule = pending_resources.shift(num_to_schedule)
scheduled = to_schedule.map do |args|
job = schedule_job(*args)
job if job&.fetch(:job_id, nil).present?
end.compact
scheduled = to_schedule.map { |args| schedule_job(*args) }.compact
scheduled_jobs.concat(scheduled)
log_info("Loop #{loops}", enqueued: scheduled.length, pending: pending_resources.length, scheduled: scheduled_jobs.length, capacity: capacity)
......
......@@ -64,6 +64,18 @@ describe Geo::RepositoryShardSyncWorker, :geo, :delete, :clean_gitlab_redis_cach
subject.perform(shard_name)
end
it 'does not schedule a job twice for the same project' do
scheduled_jobs = [
{ job_id: 1, project_id: unsynced_project.id },
{ job_id: 2, project_id: unsynced_project_in_restricted_group.id }
]
is_expected.to receive(:scheduled_jobs).and_return(scheduled_jobs).at_least(:once)
is_expected.not_to receive(:schedule_job)
Sidekiq::Testing.inline! { subject.perform(shard_name) }
end
it 'does not perform Geo::ProjectSyncWorker when no geo database is configured' do
allow(Gitlab::Geo).to receive(:geo_database_configured?) { false }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment