Commit 2d08a15f authored by Stan Hu's avatar Stan Hu

Merge branch 'feature/improve-geo-logging' into 'master'

Improve Geo logging of repository errors

Closes #2968

See merge request gitlab-org/gitlab-ee!3402
parents 091d1f97 38ec49a2
......@@ -4,6 +4,8 @@ class Geo::ProjectRegistry < Geo::BaseRegistry
validates :project, presence: true, uniqueness: true
scope :dirty, -> { where(arel_table[:resync_repository].eq(true).or(arel_table[:resync_wiki].eq(true))) }
scope :failed_repos, -> { where(arel_table[:repository_retry_count].gt(0)) }
scope :failed_wikis, -> { where(arel_table[:wiki_retry_count].gt(0)) }
def self.failed
repository_sync_failed = arel_table[:repository_retry_count].gt(0)
......
......@@ -166,6 +166,17 @@ class GeoNode < ActiveRecord::Base
end
end
def filtered_project_registries(type = nil)
case type
when 'repository'
project_registries.failed_repos
when 'wiki'
project_registries.failed_wikis
else
project_registries.failed
end
end
def uploads
if restricted_project_ids
uploads_table = Upload.arel_table
......
class GeoProjectRegistryEntity < Grape::Entity
expose :project_id
expose :last_repository_synced_at
expose :last_repository_successful_sync_at
expose :last_wiki_synced_at
expose :last_wiki_successful_sync_at
expose :repository_retry_count
expose :wiki_retry_count
expose :last_repository_sync_failure
expose :last_wiki_sync_failure
end
class GeoProjectRegistrySerializer < BaseSerializer
entity GeoProjectRegistryEntity
end
......@@ -127,13 +127,11 @@ module Geo
@registry ||= Geo::ProjectRegistry.find_or_initialize_by(project_id: project.id)
end
def update_registry(started_at: nil, finished_at: nil)
def update_registry!(started_at: nil, finished_at: nil, attrs: {})
return unless started_at || finished_at
log_info("Updating #{type} sync information")
attrs = {}
if started_at
attrs["last_#{type}_synced_at"] = started_at
attrs["#{type}_retry_count"] = retry_count + 1
......@@ -150,6 +148,15 @@ module Geo
registry.update!(attrs)
end
def fail_registry!(message, error, attrs = {})
log_error(message, error)
attrs["last_#{type}_sync_failure"] = "#{message}: #{error.message}"
attrs["#{type}_retry_count"] = retry_count + 1
registry.update!(attrs)
end
def type
self.class.type
end
......
......@@ -11,7 +11,7 @@ module Geo
def fetch_project_repository(redownload)
log_info('Trying to fetch project repository')
update_registry(started_at: DateTime.now)
update_registry!(started_at: DateTime.now)
if redownload
log_info('Redownloading repository')
......@@ -22,20 +22,18 @@ module Geo
fetch_geo_mirror(project.repository)
end
update_registry(finished_at: DateTime.now)
update_registry!(finished_at: DateTime.now, attrs: { last_repository_sync_failure: nil })
log_info('Finished repository sync',
update_delay_s: update_delay_in_seconds,
download_time_s: download_time_in_seconds)
rescue Gitlab::Shell::Error,
Gitlab::Git::RepositoryMirroring::RemoteError,
Geo::EmptyCloneUrlPrefixError => e
log_error('Error syncing repository', e)
registry.increment!(:repository_retry_count)
fail_registry!('Error syncing repository', e)
rescue Gitlab::Git::Repository::NoRepository => e
log_error('Invalid repository', e)
log_info('Setting force_to_redownload flag')
registry.update(force_to_redownload_repository: true,
repository_retry_count: retry_count + 1)
fail_registry!('Invalid repository', e, force_to_redownload_repository: true)
log_info('Expiring caches')
project.repository.after_create
ensure
......
module Geo
class WikiSyncService < BaseSyncService
include Gitlab::ShellAdapter
self.type = :wiki
private
......@@ -12,7 +10,7 @@ module Geo
def fetch_wiki_repository(redownload)
log_info('Fetching wiki repository')
update_registry(started_at: DateTime.now)
update_registry!(started_at: DateTime.now)
if redownload
log_info('Redownloading wiki')
......@@ -23,7 +21,7 @@ module Geo
fetch_geo_mirror(project.wiki.repository)
end
update_registry(finished_at: DateTime.now)
update_registry!(finished_at: DateTime.now, attrs: { last_wiki_sync_failure: nil })
log_info('Finished wiki sync',
update_delay_s: update_delay_in_seconds,
......@@ -32,12 +30,10 @@ module Geo
Gitlab::Shell::Error,
ProjectWiki::CouldNotCreateWikiError,
Geo::EmptyCloneUrlPrefixError => e
log_error('Error syncing wiki repository', e)
registry.increment!(:wiki_retry_count)
fail_registry!('Error syncing wiki repository', e)
rescue Gitlab::Git::Repository::NoRepository => e
log_error('Invalid wiki', e)
registry.update(force_to_redownload_wiki: true,
wiki_retry_count: retry_count + 1)
log_info('Setting force_to_redownload flag')
fail_registry!('Invalid wiki', e, force_to_redownload_wiki: true)
ensure
clean_up_temporary_repository if redownload
end
......
---
title: Improve Geo logging of repository errors
merge_request: 3402
author:
type: changed
class AddLastSyncFailureToProjectRegistry < ActiveRecord::Migration
DOWNTIME = false
def change
add_column :project_registry, :last_repository_sync_failure, :string
add_column :project_registry, :last_wiki_sync_failure, :string
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20171101105200) do
ActiveRecord::Schema.define(version: 20171115143841) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -50,6 +50,8 @@ ActiveRecord::Schema.define(version: 20171101105200) do
t.integer "wiki_retry_count"
t.datetime "wiki_retry_at"
t.boolean "force_to_redownload_wiki"
t.string "last_repository_sync_failure"
t.string "last_wiki_sync_failure"
end
add_index "project_registry", ["last_repository_successful_sync_at"], name: "index_project_registry_on_last_repository_successful_sync_at", using: :btree
......
......@@ -139,3 +139,38 @@ Example response:
"last_successful_status_check_timestamp": 1510125268
}
```
## Retrieve project sync failures ocurred on the current node
```
GET /geo_nodes/current/failures
```
| Attribute | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `type` | string | no | Type of failure (`repository`/`wiki`) |
This endpoint uses [Pagination](README.md#pagination).
```bash
curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/geo_nodes/current/failures
```
Example response:
```json
[
{
"project_id": 3,
"last_repository_synced_at": "2017-10-31 14:25:55 UTC",
"last_repository_successful_sync_at": "2017-10-31 14:26:04 UTC",
"last_wiki_synced_at": "2017-10-31 14:26:04 UTC",
"last_wiki_successful_sync_at": "2017-10-31 14:26:11 UTC",
"repository_retry_count": null,
"wiki_retry_count": 1,
"last_repository_sync_failure": null,
"last_wiki_sync_failure": "Error syncing Wiki repository"
}
]
```
......@@ -33,6 +33,27 @@ module API
present paginate(status), with: GeoNodeStatusEntity
end
# Get project registry failures for the current Geo node
#
# Example request:
# GET /geo_nodes/current/failures
desc 'Get project registry failures for the current Geo node' do
success ::GeoProjectRegistryEntity
end
params do
optional :type, type: String, values: %w[wiki repository], desc: 'Type of failure (repository/wiki)'
use :pagination
end
get '/current/failures' do
geo_node = Gitlab::Geo.current_node
not_found('Geo node not found') unless geo_node
project_registries = paginate(geo_node.filtered_project_registries(params[:type]))
present project_registries, with: ::GeoProjectRegistryEntity
end
# Get all Geo node information
#
# Example request:
......
......@@ -3,6 +3,8 @@ module Gitlab
InvalidDecryptionKeyError = Class.new(StandardError)
class JwtRequestDecoder
include LogHelpers
IAT_LEEWAY = 60.seconds.to_i
def self.geo_auth_attempt?(header)
......@@ -33,7 +35,7 @@ module Gitlab
data = decode_auth_header
rescue OpenSSL::Cipher::CipherError
message = 'Error decrypting the Geo secret from the database. Check that the primary and secondary have the same db_key_base.'
Rails.logger.error(message)
log_error(message)
raise InvalidDecryptionKeyError.new(message)
end
......@@ -54,7 +56,7 @@ module Gitlab
data&.deep_symbolize_keys!
data
rescue JWT::DecodeError => e
Rails.logger.error("Error decoding Geo request: #{e}")
log_error("Error decoding Geo request: #{e}")
return
end
end
......
{
"type": "array",
"required" : [
"project_id",
"last_repository_synced_at",
"last_repository_successful_sync_at",
"last_wiki_synced_at",
"last_wiki_successful_sync_at",
"repository_retry_count",
"wiki_retry_count",
"last_repository_sync_failure",
"last_wiki_sync_failure"
],
"properties" : {
"project_id": { "type": "integer" },
"last_repository_synced_at": { "type": ["string"] },
"last_repository_successful_sync_at": { "type": ["string"] },
"last_wiki_synced_at": { "type": ["string"] },
"last_wiki_successful_sync_at": { "type": ["string"] },
"repository_retry_count": { "type": "integer" },
"wiki_retry_count": { "type": "integer" },
"last_repository_sync_failure": { "type": ["string"] },
"last_wiki_sync_failure": { "type": ["string"] }
},
"additionalProperties": false
}
......@@ -18,14 +18,14 @@ describe API::GeoNodes, :geo, api: true do
it 'retrieves the Geo nodes if admin is logged in' do
get api("/geo_nodes", admin)
expect(response.status).to eq 200
expect(response).to have_gitlab_http_status(200)
expect(response).to match_response_schema('geo_nodes')
end
it 'denies access if not admin' do
get api('/geo_nodes', user)
expect(response.status).to eq 403
expect(response).to have_gitlab_http_status(403)
end
end
......@@ -33,14 +33,14 @@ describe API::GeoNodes, :geo, api: true do
it 'retrieves the Geo nodes if admin is logged in' do
get api("/geo_nodes/#{primary.id}", admin)
expect(response.status).to eq 200
expect(response).to have_gitlab_http_status(200)
expect(response).to match_response_schema('geo_node')
end
it 'denies access if not admin' do
get api('/geo_nodes', user)
expect(response.status).to eq 403
expect(response).to have_gitlab_http_status(403)
end
end
......@@ -48,14 +48,14 @@ describe API::GeoNodes, :geo, api: true do
it 'retrieves the Geo nodes status if admin is logged in' do
get api("/geo_nodes/status", admin)
expect(response.status).to eq 200
expect(response).to have_gitlab_http_status(200)
expect(response).to match_response_schema('geo_node_statuses')
end
it 'denies access if not admin' do
get api('/geo_nodes', user)
expect(response.status).to eq 403
expect(response).to have_gitlab_http_status(403)
end
end
......@@ -65,7 +65,7 @@ describe API::GeoNodes, :geo, api: true do
expect(GeoNodeStatus).not_to receive(:current_node_status)
get api("/geo_nodes/#{secondary.id}/status", admin)
expect(response.status).to eq 200
expect(response).to have_gitlab_http_status(200)
expect(response).to match_response_schema('geo_node_status')
end
......@@ -76,14 +76,89 @@ describe API::GeoNodes, :geo, api: true do
get api("/geo_nodes/#{secondary.id}/status", admin)
expect(response.status).to eq 200
expect(response).to have_gitlab_http_status(200)
expect(response).to match_response_schema('geo_node_status')
end
it 'denies access if not admin' do
get api('/geo_nodes', user)
expect(response.status).to eq 403
expect(response).to have_gitlab_http_status(403)
end
end
describe 'GET /geo_nodes/current/failures/:type' do
it 'fetches the current node failures' do
create(:geo_project_registry, :sync_failed)
create(:geo_project_registry, :sync_failed)
stub_current_geo_node(secondary)
expect(Gitlab::Geo).to receive(:current_node).and_return(secondary)
get api("/geo_nodes/current/failures", admin)
expect(response).to have_gitlab_http_status(200)
expect(response).to match_response_schema('geo_project_registry')
end
it 'does not show any registry when there is no failure' do
create(:geo_project_registry, :synced)
stub_current_geo_node(secondary)
expect(Gitlab::Geo).to receive(:current_node).and_return(secondary)
get api("/geo_nodes/current/failures", admin)
expect(response).to have_gitlab_http_status(200)
expect(json_response.count).to be_zero
end
context 'wiki type' do
it 'only shows wiki failures' do
create(:geo_project_registry, :wiki_sync_failed)
create(:geo_project_registry, :repository_sync_failed)
stub_current_geo_node(secondary)
expect(Gitlab::Geo).to receive(:current_node).and_return(secondary)
get api("/geo_nodes/current/failures?type=wiki", admin)
expect(response).to have_gitlab_http_status(200)
expect(json_response.count).to eq(1)
expect(json_response.first['wiki_retry_count']).to be > 0
end
end
context 'repository type' do
it 'only shows repository failures' do
create(:geo_project_registry, :wiki_sync_failed)
create(:geo_project_registry, :repository_sync_failed)
stub_current_geo_node(secondary)
expect(Gitlab::Geo).to receive(:current_node).and_return(secondary)
get api("/geo_nodes/current/failures?type=repository", admin)
expect(response).to have_gitlab_http_status(200)
expect(json_response.count).to eq(1)
expect(json_response.first['repository_retry_count']).to be > 0
end
end
context 'nonexistent type' do
it 'returns a bad request' do
create(:geo_project_registry, :repository_sync_failed)
get api("/geo_nodes/current/failures?type=nonexistent", admin)
expect(response).to have_gitlab_http_status(400)
end
end
it 'denies access if not admin' do
get api("/geo_nodes/current/failures", user)
expect(response).to have_gitlab_http_status(403)
end
end
end
require 'spec_helper'
describe GeoProjectRegistryEntity, :postgresql do
let(:registry) { create(:geo_project_registry, :synced) }
let(:entity) do
described_class.new(registry, request: double)
end
subject { entity.as_json }
it { is_expected.to have_key(:project_id) }
it { is_expected.to have_key(:last_repository_synced_at) }
it { is_expected.to have_key(:last_repository_successful_sync_at) }
it { is_expected.to have_key(:last_wiki_synced_at) }
it { is_expected.to have_key(:last_wiki_successful_sync_at) }
it { is_expected.to have_key(:repository_retry_count) }
it { is_expected.to have_key(:wiki_retry_count) }
it { is_expected.to have_key(:last_repository_sync_failure) }
it { is_expected.to have_key(:last_wiki_sync_failure) }
end
......@@ -52,6 +52,12 @@ describe Geo::RepositorySyncService do
subject.execute
end
it 'voids the failure message when it succeeds after an error' do
registry = create(:geo_project_registry, project: project, last_repository_sync_failure: 'error')
expect { subject.execute }.to change { registry.reload.last_repository_sync_failure}.to(nil)
end
it 'returns the lease when sync fail' do
allow(repository).to receive(:fetch_as_mirror).with(url_to_repo, forced: true) { raise Gitlab::Shell::Error }
......@@ -145,7 +151,7 @@ describe Geo::RepositorySyncService do
let(:registry) { Geo::ProjectRegistry.find_by(project_id: project.id) }
before do
allow(repository).to receive(:fetch_as_mirror).with(url_to_repo, forced: true) { raise Gitlab::Shell::Error }
allow(repository).to receive(:fetch_as_mirror).with(url_to_repo, forced: true) { raise Gitlab::Shell::Error, 'shell error' }
subject.execute
end
......@@ -165,6 +171,10 @@ describe Geo::RepositorySyncService do
it 'resets repository_retry_at' do
expect(registry.repository_retry_at).to be_present
end
it 'sets last_repository_sync_failure' do
expect(registry.last_repository_sync_failure).to eq('Error syncing repository: shell error')
end
end
end
......
......@@ -44,6 +44,12 @@ RSpec.describe Geo::WikiSyncService do
subject.execute
end
it 'voids the failure message when it succeeds after an error' do
registry = create(:geo_project_registry, project: project, last_wiki_sync_failure: 'error')
expect { subject.execute }.to change { registry.reload.last_wiki_sync_failure}.to(nil)
end
it 'does not fetch wiki repository if cannot obtain a lease' do
allow(lease).to receive(:try_obtain) { false }
......@@ -117,7 +123,7 @@ RSpec.describe Geo::WikiSyncService do
let(:registry) { Geo::ProjectRegistry.find_by(project_id: project.id) }
before do
allow(repository).to receive(:fetch_as_mirror).with(url_to_repo, forced: true) { raise Gitlab::Shell::Error }
allow(repository).to receive(:fetch_as_mirror).with(url_to_repo, forced: true) { raise Gitlab::Shell::Error, 'shell error' }
subject.execute
end
......@@ -129,6 +135,10 @@ RSpec.describe Geo::WikiSyncService do
it 'resets last_wiki_successful_sync_at' do
expect(registry.last_wiki_successful_sync_at).to be_nil
end
it 'sets last_wiki_sync_failure' do
expect(registry.last_wiki_sync_failure).to eq('Error syncing wiki repository: shell error')
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment