Commit f6658ab0 authored by mo khan's avatar mo khan Committed by Mayra Cabrera

Add spdx_identifier to software_licenses

This adds the `spdx_identifier` column to the `software_licenses` table.
It also adds a sidekiq-cron job that updates our list of software licenses
from the SPDX database on a weekly basis. This gives our customers
a more comprehensive set of software licenses to choose from and
reduces the need for user-entered software license names.
parent 9b4313cf
...@@ -491,6 +491,9 @@ Gitlab.ee do ...@@ -491,6 +491,9 @@ Gitlab.ee do
Settings.cron_jobs['historical_data_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['historical_data_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['historical_data_worker']['cron'] ||= '0 12 * * *' Settings.cron_jobs['historical_data_worker']['cron'] ||= '0 12 * * *'
Settings.cron_jobs['historical_data_worker']['job_class'] = 'HistoricalDataWorker' Settings.cron_jobs['historical_data_worker']['job_class'] = 'HistoricalDataWorker'
Settings.cron_jobs['import_software_licenses_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['import_software_licenses_worker']['cron'] ||= '0 3 * * 0'
Settings.cron_jobs['import_software_licenses_worker']['job_class'] = 'ImportSoftwareLicensesWorker'
Settings.cron_jobs['ldap_group_sync_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['ldap_group_sync_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['ldap_group_sync_worker']['cron'] ||= '0 * * * *' Settings.cron_jobs['ldap_group_sync_worker']['cron'] ||= '0 * * * *'
Settings.cron_jobs['ldap_group_sync_worker']['job_class'] = 'LdapAllGroupsSyncWorker' Settings.cron_jobs['ldap_group_sync_worker']['job_class'] = 'LdapAllGroupsSyncWorker'
......
# frozen_string_literal: true
# Schema migration: introduces the `spdx_identifier` string column
# (at most 255 characters) on the `software_licenses` table.
class AddSpdxIdToSoftwareLicenses < ActiveRecord::Migration[5.2]
  DOWNTIME = false

  # Creates the column. No `null: false` is given, so it stays nullable.
  def up
    add_column(:software_licenses, :spdx_identifier, :string, limit: 255)
  end

  # Reverts #up by dropping the column.
  def down
    remove_column(:software_licenses, :spdx_identifier)
  end
end
# frozen_string_literal: true
# Schema migration: indexes `software_licenses.spdx_identifier`.
class AddIndexToSoftwareLicensesOnSpdxId < ActiveRecord::Migration[5.2]
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false

  # Run outside a wrapping DDL transaction.
  # NOTE(review): presumably required by the concurrent index helpers below;
  # confirm against Gitlab::Database::MigrationHelpers.
  disable_ddl_transaction!

  # Builds the index through the project's concurrent-index helper.
  def up
    add_concurrent_index(:software_licenses, :spdx_identifier)
  end

  # Removes the index created by #up.
  def down
    remove_concurrent_index(:software_licenses, :spdx_identifier)
  end
end
# frozen_string_literal: true
# Data migration: assigns an SPDX identifier to every `software_licenses` row
# whose `name` is one of the keys of CURRENT_LICENSES.
class BackfillSoftwareLicensesSpdxIdentifiers < ActiveRecord::Migration[5.2]
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false

  # Existing license name => SPDX identifier written for rows with that name.
  CURRENT_LICENSES = {
    'AGPL-1.0' => 'AGPL-1.0',
    'AGPL-3.0' => 'AGPL-3.0',
    'Apache 2.0' => 'Apache-2.0',
    'Artistic-2.0' => 'Artistic-2.0',
    'BSD' => 'BSD-4-Clause',
    'CC0 1.0 Universal' => 'CC0-1.0',
    'CDDL-1.0' => 'CDDL-1.0',
    'CDDL-1.1' => 'CDDL-1.1',
    'EPL-1.0' => 'EPL-1.0',
    'EPL-2.0' => 'EPL-2.0',
    'GPLv2' => 'GPL-2.0',
    'GPLv3' => 'GPL-3.0',
    'ISC' => 'ISC',
    'LGPL' => 'LGPL-3.0-only',
    'LGPL-2.1' => 'LGPL-2.1',
    'MIT' => 'MIT',
    'Mozilla Public License 2.0' => 'MPL-2.0',
    'MS-PL' => 'MS-PL',
    'MS-RL' => 'MS-RL',
    'New BSD' => 'BSD-3-Clause',
    'Python Software Foundation License' => 'Python-2.0',
    'ruby' => 'Ruby',
    'Simplified BSD' => 'BSD-2-Clause',
    'WTFPL' => 'WTFPL',
    'Zlib' => 'Zlib'
  }.freeze

  disable_ddl_transaction!

  # Writes the mapped identifier for each known license name (EE only).
  # 25 records to be updated on GitLab.com
  def up
    return unless Gitlab.ee?

    say "Expect #{CURRENT_LICENSES.count} updates to the software_licenses table to occur"

    # The following cop is disabled because of https://gitlab.com/gitlab-org/gitlab/issues/33470
    # For more context see https://gitlab.com/gitlab-org/gitlab/merge_requests/17004#note_226264823
    # rubocop:disable Migration/UpdateColumnInBatches
    CURRENT_LICENSES.each_pair do |license_name, identifier|
      update_column_in_batches(:software_licenses, :spdx_identifier, identifier) do |table, query|
        # Restrict each batch update to the rows carrying this license name.
        query.where(table[:name].eq(license_name))
      end
    end
  end

  # Clears `spdx_identifier` on every row of the table (EE only); the update
  # is not restricted to the names in CURRENT_LICENSES.
  def down
    return unless Gitlab.ee?

    update_column_in_batches(:software_licenses, :spdx_identifier, nil)
  end
end
...@@ -3350,7 +3350,9 @@ ActiveRecord::Schema.define(version: 2019_09_29_180827) do ...@@ -3350,7 +3350,9 @@ ActiveRecord::Schema.define(version: 2019_09_29_180827) do
create_table "software_licenses", id: :serial, force: :cascade do |t| create_table "software_licenses", id: :serial, force: :cascade do |t|
t.string "name", null: false t.string "name", null: false
t.string "spdx_identifier", limit: 255
t.index ["name"], name: "index_software_licenses_on_name" t.index ["name"], name: "index_software_licenses_on_name"
t.index ["spdx_identifier"], name: "index_software_licenses_on_spdx_identifier"
end end
create_table "spam_logs", id: :serial, force: :cascade do |t| create_table "spam_logs", id: :serial, force: :cascade do |t|
......
...@@ -6,8 +6,12 @@ class SoftwareLicense < ApplicationRecord ...@@ -6,8 +6,12 @@ class SoftwareLicense < ApplicationRecord
include Presentable include Presentable
validates :name, presence: true validates :name, presence: true
validates :spdx_identifier, length: { maximum: 255 }
scope :by_name, -> (names) { where(name: names) }
scope :ordered, -> { order(:name) } scope :ordered, -> { order(:name) }
scope :unknown, -> { where(spdx_identifier: nil) }
scope :grouped_by_name, -> { group(:name) }
def self.create_policy_for!(project:, name:, approval_status:) def self.create_policy_for!(project:, name:, approval_status:)
project.software_license_policies.create!( project.software_license_policies.create!(
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
- cronjob:geo_repository_verification_secondary_shard - cronjob:geo_repository_verification_secondary_shard
- cronjob:geo_container_repository_sync_dispatch - cronjob:geo_container_repository_sync_dispatch
- cronjob:historical_data - cronjob:historical_data
- cronjob:import_software_licenses
- cronjob:ldap_all_groups_sync - cronjob:ldap_all_groups_sync
- cronjob:ldap_sync - cronjob:ldap_sync
- cronjob:update_all_mirrors - cronjob:update_all_mirrors
......
# frozen_string_literal: true
# Worker in the `cronjob` queue namespace that reconciles the
# `software_licenses` table with the latest SPDX catalogue.
class ImportSoftwareLicensesWorker
  include ApplicationWorker

  queue_namespace :cronjob

  # Walks the catalogue and imports each license it yields.
  def perform
    catalogue.each { |spdx_license| import(spdx_license) }
  end

  private

  # Either stamps the SPDX identifier on existing rows that share the
  # license's name but have no identifier yet, or finds/creates a row
  # matching both name and identifier.
  def import(spdx_license)
    license_name = spdx_license.name
    identifier = spdx_license.id

    if unknown_licenses[license_name]
      unknown_licenses_with(license_name).update_all(spdx_identifier: identifier)
    else
      SoftwareLicense.safe_find_or_create_by!(name: license_name, spdx_identifier: identifier)
    end
  end

  # Memoized grouped count, keyed by name, of rows that have no identifier and
  # whose name appears in the catalogue.
  def unknown_licenses
    @unknown_licenses ||= begin
      catalogue_names = catalogue.map(&:name)
      unknown_licenses_with(catalogue_names).grouped_by_name.count
    end
  end

  # Relation of identifier-less licenses restricted to the given name(s).
  def unknown_licenses_with(name)
    SoftwareLicense.unknown.by_name(name)
  end

  # Memoized result of Gitlab::SPDX::Catalogue.latest.
  def catalogue
    @catalogue ||= Gitlab::SPDX::Catalogue.latest
  end
end
---
title: Backfill SPDX identifiers in software_licenses table
merge_request: 17004
author:
type: fixed
# frozen_string_literal: true
module Gitlab
  module SPDX
    # Enumerable wrapper around a parsed SPDX license-list document.
    #
    # The wrapped hash is read with symbol keys: `:licenseListVersion` for the
    # version and `:licenses` for an array of hashes carrying `:licenseId`
    # and `:name`.
    class Catalogue
      include Enumerable

      # Fetches the catalogue through CatalogueGateway.
      #
      # @return [Object] whatever `CatalogueGateway#fetch` returns
      def self.latest
        CatalogueGateway.new.fetch
      end

      # @param catalogue [Hash] parsed license list (symbol keys)
      def initialize(catalogue = {})
        @catalogue = catalogue
      end

      # @return [Object, nil] the `:licenseListVersion` entry, nil when absent
      def version
        catalogue[:licenseListVersion]
      end

      # Yields every license whose identifier is present; entries with a nil
      # or blank `licenseId` are skipped.
      #
      # @yieldparam license [Gitlab::SPDX::License]
      # @return [Enumerator] when called without a block, so the catalogue can
      #   be enumerated externally like any other Enumerable
      def each
        return enum_for(:each) unless block_given?

        licenses.each do |license|
          yield license if license.id.present?
        end
      end

      private

      attr_reader :catalogue

      # Memoized License structs; a missing `:licenses` key is treated as an
      # empty list.
      def licenses
        @licenses ||= catalogue.fetch(:licenses, []).map { |x| map_from(x) }
      end

      # Converts one raw license hash into a License struct.
      def map_from(license_hash)
        ::Gitlab::SPDX::License.new(id: license_hash[:licenseId], name: license_hash[:name])
      end
    end
  end
end
# frozen_string_literal: true
module Gitlab
  module SPDX
    # Downloads the SPDX license list and wraps it in a Catalogue.
    #
    # Every failure mode handled here is reported through
    # `Gitlab::Metrics.add_event(:spdx_fetch_failed, ...)` and answered with
    # an empty catalogue instead of an exception.
    class CatalogueGateway
      URL = 'https://spdx.org/licenses/licenses.json'

      # Fetches and parses the license list.
      #
      # @return [Gitlab::SPDX::Catalogue] the parsed catalogue, or an empty
      #   one when the request is unsuccessful, raises one of
      #   `Gitlab::HTTP::HTTP_ERRORS`, or returns a body that is not valid JSON
      def fetch
        response = ::Gitlab::HTTP.get(URL)

        if response.success?
          parse(response.body)
        else
          record_failure(http_status_code: response.code)
          empty_catalogue
        end
      rescue JSON::ParserError
        # A successful response can still carry a non-JSON body (for example
        # an HTML error page). A fixed message is recorded because the parser's
        # own message embeds part of the response body.
        record_failure(error_message: 'malformed JSON')
        empty_catalogue
      rescue *::Gitlab::HTTP::HTTP_ERRORS => error
        record_failure(error_message: error.message)
        empty_catalogue
      end

      private

      # Parses the JSON body with symbolized keys and wraps it.
      def parse(json)
        build_catalogue(JSON.parse(json, symbolize_names: true))
      end

      # Emits the failure event with the given tags.
      def record_failure(tags = {})
        Gitlab::Metrics.add_event(:spdx_fetch_failed, tags)
      end

      # Catalogue containing no licenses, used as the failure fallback.
      def empty_catalogue
        build_catalogue(licenses: [])
      end

      def build_catalogue(hash)
        ::Gitlab::SPDX::Catalogue.new(hash)
      end
    end
  end
end
# frozen_string_literal: true
module Gitlab
  module SPDX
    # Value object for one catalogue entry: an SPDX `id` plus a display
    # `name`. Built with keyword arguments, e.g.
    # `License.new(id: 'MIT', name: 'MIT License')`.
    License = Struct.new(:id, :name, keyword_init: true)
  end
end
# frozen_string_literal: true
# Factory producing a Gitlab::SPDX::Catalogue backed by the JSON fixture at
# spec/fixtures/spdx.json.
FactoryBot.define do
  factory :spdx_catalogue, class: ::Gitlab::SPDX::Catalogue do
    # The catalogue takes its data through the constructor, so the instance is
    # built explicitly from the parsed fixture (symbol keys).
    initialize_with do
      fixture_path = Rails.root.join('spec', 'fixtures', 'spdx.json')
      parsed_fixture = JSON.parse(IO.read(fixture_path), symbolize_names: true)

      ::Gitlab::SPDX::Catalogue.new(parsed_fixture)
    end
  end
end
# frozen_string_literal: true
# Factory for Gitlab::SPDX::License structs.
#
# Without a trait, each built license gets a numbered placeholder id and name.
# These are declared as sequences: a plain dynamic attribute whose block takes
# an argument receives the factory evaluator, not a counter, so
# `id { |n| "License-#{n}" }` would interpolate an object inspection.
FactoryBot.define do
  factory :spdx_license, class: ::Gitlab::SPDX::License do
    sequence(:id) { |n| "License-#{n}" }
    sequence(:name) { |n| "License #{n}" }

    trait :apache_1 do
      id { 'Apache-1.0' }
      name { 'Apache License 1.0' }
    end

    trait :bsd do
      id { 'BSD-4-Clause' }
      name { 'BSD 4-Clause "Original" or "Old" License' }
    end

    trait :mit do
      id { 'MIT' }
      name { 'MIT License' }
    end
  end
end
# frozen_string_literal: true
require 'spec_helper'
# Specs for CatalogueGateway#fetch against a stubbed licenses.json endpoint.
describe Gitlab::SPDX::CatalogueGateway do
include StubRequests
describe "#fetch" do
# Memoized per example, so calling `result` again re-uses the first fetch.
let(:result) { subject.fetch }
let(:url) { described_class::URL }
# A 200 response serving the JSON fixture: the fetched catalogue must
# enumerate as many entries as the fixture has licenses.
context "when the licenses.json endpoint is healthy" do
let(:spdx_json) { IO.read(Rails.root.join("spec", "fixtures", "spdx.json")) }
let(:catalogue_hash) { JSON.parse(spdx_json, symbolize_names: true) }
before do
stub_full_request(url, method: :get).to_return(status: 200, body: spdx_json)
end
it { expect(result.count).to be(catalogue_hash[:licenses].count) }
end
# A 404 response: expect an empty catalogue and a failure event tagged with
# the HTTP status code.
context "when the licenses.json endpoint is not reachable" do
before do
allow(Gitlab::Metrics).to receive(:add_event)
stub_full_request(url, method: :get).to_return(status: 404)
# Force the fetch here so the spy has recorded the call before the
# `have_received` expectation below runs.
result
end
it { expect(result.count).to be_zero }
it { expect(Gitlab::Metrics).to have_received(:add_event).with(:spdx_fetch_failed, http_status_code: 404) }
end
# One context is generated per error class listed in Gitlab::HTTP::HTTP_ERRORS:
# each must be turned into an empty catalogue plus a failure event.
Gitlab::HTTP::HTTP_ERRORS.each do |error|
context "when an `#{error}` is raised while trying to connect to the endpoint" do
before do
allow(Gitlab::Metrics).to receive(:add_event)
stub_full_request(url, method: :get).and_raise(error)
# Force the fetch so the spy assertion below has something to verify.
result
end
it { expect(result.count).to be_zero }
it { expect(Gitlab::Metrics).to have_received(:add_event).with(:spdx_fetch_failed, anything) }
end
end
end
end
# frozen_string_literal: true
require "spec_helper"
# Specs for Catalogue: version lookup, enumeration/filtering, and `.latest`.
RSpec.describe Gitlab::SPDX::Catalogue do
# NOTE(review): no request is stubbed anywhere in this spec, so this include
# looks unused — confirm before removing.
include StubRequests
subject { described_class.new(catalogue_hash) }
# Default input: the parsed JSON fixture with symbol keys.
let(:spdx_json) { IO.read(Rails.root.join("spec", "fixtures", "spdx.json")) }
let(:catalogue_hash) { JSON.parse(spdx_json, symbolize_names: true) }
describe "#version" do
let(:version) { SecureRandom.uuid }
it { expect(described_class.new(licenseListVersion: version).version).to eql(version) }
end
describe "#each" do
# With the fixture, every license is yielded with its id and name.
it { expect(subject.count).to eql(catalogue_hash[:licenses].count) }
it { expect(subject.map(&:id)).to match_array(catalogue_hash[:licenses].map { |x| x[:licenseId] }) }
it { expect(subject.map(&:name)).to match_array(catalogue_hash[:licenses].map { |x| x[:name] }) }
# Entries whose licenseId is nil or empty must be skipped.
context "when some of the licenses are missing an identifier" do
let(:catalogue_hash) do
{
licenseListVersion: "3.6",
licenses: [
{ licenseId: nil, name: "nil" },
{ licenseId: "", name: "blank" },
{ licenseId: "valid", name: "valid" }
]
}
end
it { expect(subject.count).to be(1) }
it { expect(subject.map(&:id)).to contain_exactly("valid") }
end
# An entry without the `licenseId` key yields nothing.
context "when the schema of each license changes" do
let(:catalogue_hash) do
{
licenseListVersion: "3.6",
licenses: [
{
"license-ID": 'MIT',
name: "MIT License"
}
]
}
end
it { expect(subject.count).to be_zero }
end
# A document without the `licenses` key enumerates as empty.
context "when the schema of the catalogue changes" do
let(:catalogue_hash) { { SecureRandom.uuid.to_sym => [{ id: 'MIT', name: "MIT License" }] } }
it { expect(subject.count).to be_zero }
end
end
describe ".latest" do
subject { described_class.latest }
# The gateway is replaced by a double, so this only checks that `.latest`
# returns whatever the gateway's `fetch` returns.
context "when the licenses.json endpoint is healthy" do
let(:gateway) { instance_double(Gitlab::SPDX::CatalogueGateway, fetch: catalogue) }
let(:catalogue) { instance_double(described_class) }
before do
allow(Gitlab::SPDX::CatalogueGateway).to receive(:new).and_return(gateway)
end
it { expect(subject).to be(catalogue) }
end
end
end
# frozen_string_literal: true
require 'spec_helper'
require Rails.root.join('db', 'post_migrate', '20190917173107_backfill_software_licenses_spdx_identifiers.rb')
# Migration spec: checks that the backfill assigns the expected SPDX
# identifier per license name and that rolling back clears it again.
describe BackfillSoftwareLicensesSpdxIdentifiers, :migration do
let(:software_licenses) { table(:software_licenses) }
describe '#up' do
let(:javascript_default_names) { expected_identifiers_for_javascript_default_names.keys }
# License name => identifier expected after the migration has run.
let(:expected_identifiers_for_javascript_default_names) do
{
'AGPL-1.0' => 'AGPL-1.0',
'AGPL-3.0' => 'AGPL-3.0',
'Apache 2.0' => 'Apache-2.0',
'Artistic-2.0' => 'Artistic-2.0',
'BSD' => 'BSD-4-Clause',
'CC0 1.0 Universal' => 'CC0-1.0',
'CDDL-1.0' => 'CDDL-1.0',
'CDDL-1.1' => 'CDDL-1.1',
'EPL-1.0' => 'EPL-1.0',
'EPL-2.0' => 'EPL-2.0',
'GPLv2' => 'GPL-2.0',
'GPLv3' => 'GPL-3.0',
'ISC' => 'ISC',
'LGPL' => 'LGPL-3.0-only',
'LGPL-2.1' => 'LGPL-2.1',
'MIT' => 'MIT',
'Mozilla Public License 2.0' => 'MPL-2.0',
'MS-PL' => 'MS-PL',
'MS-RL' => 'MS-RL',
'New BSD' => 'BSD-3-Clause',
'Python Software Foundation License' => 'Python-2.0',
'ruby' => 'Ruby',
'Simplified BSD' => 'BSD-2-Clause',
'WTFPL' => 'WTFPL',
'Zlib' => 'Zlib'
}
end
# Seed one row per name, each without an identifier.
before do
software_licenses.create!(javascript_default_names.map { |name| { name: name } })
end
it 'updates the default license names that are hardcoded in javascript' do
# Precondition: every seeded row starts without an identifier.
expect(software_licenses.where(spdx_identifier: nil).count).to eq(javascript_default_names.count)
migrate!
expect(software_licenses.where(spdx_identifier: nil).count).to eq(0)
software_licenses.find_each do |license|
expect(license.spdx_identifier).to eql(expected_identifiers_for_javascript_default_names[license.name])
end
end
end
describe '#down' do
it 'resets the `spdx_identifier`' do
mit = software_licenses.create!(name: 'MIT')
# Migrate up first so there is a value for the rollback to clear.
migrate!
expect(mit.reload.spdx_identifier).to eql('MIT')
schema_migrate_down!
expect(mit.reload.spdx_identifier).to be_nil
end
end
end
...@@ -8,13 +8,14 @@ describe SoftwareLicense do ...@@ -8,13 +8,14 @@ describe SoftwareLicense do
describe 'validations' do describe 'validations' do
it { is_expected.to include_module(Presentable) } it { is_expected.to include_module(Presentable) }
it { is_expected.to validate_presence_of(:name) } it { is_expected.to validate_presence_of(:name) }
it { is_expected.to validate_length_of(:spdx_identifier).is_at_most(255) }
end end
describe ".create_policy_for!" do describe '.create_policy_for!' do
subject { described_class } subject { described_class }
let(:project) { create(:project) } let(:project) { create(:project) }
context "when a software license with a given name has already been created" do context 'when a software license with a given name has already been created' do
let(:mit_license) { create(:software_license, :mit) } let(:mit_license) { create(:software_license, :mit) }
let(:result) { subject.create_policy_for!(project: project, name: mit_license.name, approval_status: :approved) } let(:result) { subject.create_policy_for!(project: project, name: mit_license.name, approval_status: :approved) }
...@@ -23,7 +24,7 @@ describe SoftwareLicense do ...@@ -23,7 +24,7 @@ describe SoftwareLicense do
specify { expect(result.software_license).to eql(mit_license) } specify { expect(result.software_license).to eql(mit_license) }
end end
context "when a software license with a given name has NOT been created" do context 'when a software license with a given name has NOT been created' do
let(:license_name) { SecureRandom.uuid } let(:license_name) { SecureRandom.uuid }
let(:result) { subject.create_policy_for!(project: project, name: license_name, approval_status: :blacklisted) } let(:result) { subject.create_policy_for!(project: project, name: license_name, approval_status: :blacklisted) }
...@@ -33,4 +34,27 @@ describe SoftwareLicense do ...@@ -33,4 +34,27 @@ describe SoftwareLicense do
specify { expect(result.software_license.name).to eql(license_name) } specify { expect(result.software_license.name).to eql(license_name) }
end end
end end
describe 'scopes' do
subject { described_class }
let!(:mit) { create(:software_license, :mit, spdx_identifier: 'MIT') }
let!(:apache_2) { create(:software_license, :apache_2_0, spdx_identifier: nil) }
describe '.by_name' do
it { expect(subject.by_name(mit.name)).to contain_exactly(mit) }
end
describe '.unknown' do
it { expect(subject.unknown).to contain_exactly(apache_2) }
end
describe '.grouped_by_name' do
it { expect(subject.grouped_by_name.count).to eql(mit.name => 1, apache_2.name => 1) }
end
describe '.ordered' do
it { expect(subject.ordered.pluck(:name)).to eql([apache_2.name, mit.name]) }
end
end
end end
# frozen_string_literal: true
require 'spec_helper'
# Specs for the worker that imports SPDX licenses into `software_licenses`.
describe ImportSoftwareLicensesWorker do
let(:catalogue) { build(:spdx_catalogue) }
let(:spdx_apache_license) { build(:spdx_license, :apache_1) }
let(:spdx_bsd_license) { build(:spdx_license, :bsd) }
let(:spdx_mit_license) { build(:spdx_license, :mit) }
before do
# No HTTP here: `.latest` returns the factory-built catalogue, and its
# `each` is stubbed to yield exactly the three licenses built above.
allow(Gitlab::SPDX::Catalogue).to receive(:latest).and_return(catalogue)
allow(catalogue).to receive(:each)
.and_yield(spdx_apache_license)
.and_yield(spdx_bsd_license)
.and_yield(spdx_mit_license)
end
describe '#perform' do
# Starting state: Apache exists without an identifier, MIT exists with one,
# and BSD has no row at all.
let!(:apache) { create(:software_license, name: spdx_apache_license.name, spdx_identifier: nil) }
let!(:mit) { create(:software_license, name: spdx_mit_license.name, spdx_identifier: spdx_mit_license.id) }
context 'when the licenses.json endpoint is healthy' do
before do
subject.perform
end
# Apache gets its identifier filled in; BSD is created; MIT is left as-is,
# giving three rows in total.
it { expect(apache.reload.spdx_identifier).to eql(spdx_apache_license.id) }
it { expect(SoftwareLicense.count).to eq(3) }
it { expect(SoftwareLicense.pluck(:spdx_identifier)).to contain_exactly(spdx_apache_license.id, spdx_mit_license.id, spdx_bsd_license.id) }
it { expect(SoftwareLicense.pluck(:name)).to contain_exactly(spdx_apache_license.name, spdx_mit_license.name, spdx_bsd_license.name) }
end
# A second run over the same catalogue must not add rows.
context 'when run multiple times' do
it 'does not create duplicated software licenses' do
subject.perform
expect(SoftwareLicense.count).to eq(3)
expect { subject.perform }.not_to change(SoftwareLicense, :count)
end
end
end
end
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment