Commit e3dc3bfc authored by Oswaldo Ferreira's avatar Oswaldo Ferreira

Seed data for users, projects and their relations

It handles the very basics of batch inserting
records while seeding the database, and also shows
time estimates.

It leverages generate_series() (from postgres) and
Gitlab::Database.bulk_insert methods for
inserting a great amount of users, projects, routes
and namespaces.
parent 9da64b22
......@@ -336,7 +336,6 @@ group :development do
gem 'letter_opener_web', '~> 1.3.4'
gem 'rblineprof', '~> 0.3.6', platform: :mri, require: false
gem 'active_record-pg_generate_series', '~> 0.1.2'
# Better errors handler
gem 'better_errors', '~> 2.5.0'
......
......@@ -29,8 +29,6 @@ GEM
erubi (~> 1.4)
rails-dom-testing (~> 2.0)
rails-html-sanitizer (~> 1.0, >= 1.0.3)
active_record-pg_generate_series (0.1.3)
activerecord
activejob (5.2.3)
activesupport (= 5.2.3)
globalid (>= 0.3.6)
......@@ -1090,7 +1088,6 @@ DEPENDENCIES
RedCloth (~> 4.3.2)
ace-rails-ap (~> 4.1.0)
acme-client (~> 2.0.2)
active_record-pg_generate_series (~> 0.1.2)
activerecord-explain-analyze (~> 0.1)
acts-as-taggable-on (~> 6.0)
addressable (~> 2.5.2)
......
# frozen_string_literal: true
class Gitlab::Seeder::Users
include ActionView::Helpers::NumberHelper
RANDOM_USERS_COUNT = 20
MASS_USERS_COUNT = 1_500_000
MASS_USERS_COUNT = ENV['CI'] ? 10 : 1_000_000
MASS_INSERT_USERNAME_START = 'mass_insert_user_'
attr_reader :opts
......@@ -12,13 +15,43 @@ class Gitlab::Seeder::Users
def seed!
Sidekiq::Testing.inline! do
create_random_users!
create_mass_users!
create_random_users!
end
end
private
# Bulk-creates MASS_USERS_COUNT users with one INSERT ... SELECT over
# generate_series(), then inserts one namespace row for every non-admin user.
# Both statements are run through Gitlab::Seeder.with_mass_insert.
def create_mass_users!
  password_digest = Devise::Encryptor.digest(User, '12345678')

  # NOTE(review): projects_limit is filled with MASS_USERS_COUNT - confirm this is intended.
  insert_users_sql = <<~SQL
    INSERT INTO users (username, name, email, confirmed_at, projects_limit, encrypted_password)
    SELECT
    '#{MASS_INSERT_USERNAME_START}' || seq,
    'Seed user ' || seq,
    'seed_user' || seq || '@example.com',
    to_timestamp(seq),
    #{MASS_USERS_COUNT},
    '#{password_digest}'
    FROM generate_series(1, #{MASS_USERS_COUNT}) AS seq
  SQL

  Gitlab::Seeder.with_mass_insert(MASS_USERS_COUNT, User) do
    ActiveRecord::Base.connection.execute(insert_users_sql)
  end

  # The namespace step is sized by the number of non-admin users present at this point.
  non_admin_count = User.where(admin: false).count

  insert_namespaces_sql = <<~SQL
    INSERT INTO namespaces (name, path, owner_id)
    SELECT
    username,
    username,
    id
    FROM users WHERE NOT admin
  SQL

  Gitlab::Seeder.with_mass_insert(non_admin_count, Namespace) do
    ActiveRecord::Base.connection.execute(insert_namespaces_sql)
  end
end
def create_random_users!
RANDOM_USERS_COUNT.times do |i|
begin
......@@ -36,26 +69,6 @@ class Gitlab::Seeder::Users
end
end
end
# Mass-creates MASS_USERS_COUNT users through User.insert_using_generate_series.
# The ActiveRecord logger is silenced for the duration and always restored,
# even when the insert raises.
def create_mass_users!
  # Disable database insertion logs so speed isn't limited by ability to print to console
  old_logger = ActiveRecord::Base.logger
  ActiveRecord::Base.logger = nil

  encrypted_password = Devise::Encryptor.digest(User, '12345678')

  User.insert_using_generate_series(1, MASS_USERS_COUNT, debug: true) do |sql|
    sql.username = raw("'user' || seq")
    sql.name = raw("'User ' || seq")
    sql.email = raw("'user' || seq || '@example.com'")
    sql.confirmed_at = raw("('1388530801'::timestamp + seq)::date") # 2014-01-01
    sql.encrypted_password = encrypted_password
  end

  puts "\n#{number_with_delimiter(MASS_USERS_COUNT)} users created!"
ensure
  # Reset logging (in `ensure` so a failed insert cannot leave logging disabled)
  ActiveRecord::Base.logger = old_logger
end
end
Gitlab::Seeder.quiet do
......
require './spec/support/sidekiq'
# rubocop:disable Rails/Output
class Gitlab::Seeder::Projects
include ActionView::Helpers::NumberHelper
Sidekiq::Testing.inline! do
Gitlab::Seeder.quiet do
Gitlab::Seeder.without_gitaly_timeout do
project_urls = %w[
PROJECT_URLS = %w[
https://gitlab.com/gitlab-org/gitlab-test.git
https://gitlab.com/gitlab-org/gitlab-shell.git
https://gitlab.com/gnuwget/wget2.git
......@@ -39,8 +37,7 @@ Sidekiq::Testing.inline! do
https://github.com/opencontainers/runc.git
https://github.com/googlesamples/android-topeka.git
]
large_project_urls = %w[
LARGE_PROJECT_URLS = %w[
https://github.com/torvalds/linux.git
https://gitlab.gnome.org/GNOME/gimp.git
https://gitlab.gnome.org/GNOME/gnome-mud.git
......@@ -48,8 +45,62 @@ Sidekiq::Testing.inline! do
https://gitlab.com/inkscape/inkscape.git
https://github.com/gnachman/iTerm2.git
]
# Consider altering MASS_USERS_COUNT for fewer
# users with projects.
MASS_PROJECTS_COUNT_PER_USER = {
private: 3, # 3m projects +
internal: 1, # 1m projects +
public: 1 # 1m projects = 5m total
}
MASS_INSERT_NAME_START = 'mass_insert_project_'
# Entry point of the project seeder: runs the real-project, large-project and
# mass-project steps, in that order, inside a Sidekiq::Testing.inline! block.
def seed!
Sidekiq::Testing.inline! do
create_real_projects!
create_large_projects!
create_mass_projects!
end
end
private
# Creates projects from the first SIZE entries of PROJECT_URLS (8 when the
# SIZE env variable is blank). Entries at even positions are created with
# force_latest_storage enabled.
def create_real_projects!
  # You can specify how many projects you need during seed execution
  requested = ENV['SIZE'].presence&.to_i || 8

  PROJECT_URLS.first(requested).each.with_index do |url, position|
    create_real_project!(url, force_latest_storage: position.even?)
  end
end
def create_large_projects!
return unless ENV['LARGE_PROJECTS'].present?
LARGE_PROJECT_URLS.each(&method(:create_real_project!))
if ENV['FORK'].present?
puts "\nGenerating forks"
project_name = ENV['FORK'] == 'true' ? 'torvalds/linux' : ENV['FORK']
project = Project.find_by_full_path(project_name)
User.offset(1).first(5).each do |user|
new_project = ::Projects::ForkService.new(project, user).execute
def create_project(url, force_latest_storage: false)
if new_project.valid? && (new_project.valid_repo? || new_project.import_state.scheduled?)
print '.'
else
new_project.errors.full_messages.each do |error|
puts "#{new_project.full_path}: #{error}"
end
print 'F'
end
end
end
end
def create_real_project!(url, force_latest_storage: false)
group_path, project_path = url.split('/')[-2..-1]
group = Group.find_by(path: group_path)
......@@ -83,7 +134,7 @@ Sidekiq::Testing.inline! do
project = nil
Sidekiq::Worker.skipping_transaction_check do
project = Projects::CreateService.new(User.first, params).execute
project = ::Projects::CreateService.new(User.first, params).execute
# Seed-Fu runs this entire fixture in a transaction, so the `after_commit`
# hook won't run until after the fixture is loaded. That is too late
......@@ -101,37 +152,59 @@ Sidekiq::Testing.inline! do
end
end
# You can specify how many projects you need during seed execution
size = ENV['SIZE'].present? ? ENV['SIZE'].to_i : 8
project_urls.first(size).each_with_index do |url, i|
create_project(url, force_latest_storage: i.even?)
end
if ENV['LARGE_PROJECTS'].present?
large_project_urls.each(&method(:create_project))
if ENV['FORK'].present?
puts "\nGenerating forks"
project_name = ENV['FORK'] == 'true' ? 'torvalds/linux' : ENV['FORK']
project = Project.find_by_full_path(project_name)
User.offset(1).first(5).each do |user|
new_project = Projects::ForkService.new(project, user).execute
if new_project.valid? && (new_project.valid_repo? || new_project.import_state.scheduled?)
print '.'
else
new_project.errors.full_messages.each do |error|
puts "#{new_project.full_path}: #{error}"
end
print 'F'
end
end
end
end
# Mass-inserts projects for every user that owns a namespace, then backfills
# the project_features and routes rows for the projects table.
#
# Each user gets one project per slot described by MASS_PROJECTS_COUNT_PER_USER
# (visibility => number of projects). Project paths start with
# MASS_INSERT_NAME_START so the `not_mass_generated` scope can exclude them.
# The feature and route inserts use ON CONFLICT ... DO NOTHING, so existing
# rows are left untouched.
def create_mass_projects!
  projects_per_user_count = MASS_PROJECTS_COUNT_PER_USER.values.sum

  # One entry per project slot of a user, in the hash's declaration order.
  visibilities = MASS_PROJECTS_COUNT_PER_USER.flat_map do |visibility, count|
    [visibility.to_s] * count
  end
  visibility_levels = visibilities.map { |visibility| Gitlab::VisibilityLevel.level_value(visibility) }

  # Comma-separated lists that are embedded below as PostgreSQL array literals
  # and indexed by the generate_series() value.
  visibility_per_user = visibilities.join(',')
  visibility_level_per_user = visibility_levels.join(',')

  Gitlab::Seeder.with_mass_insert(User.count * projects_per_user_count, "Projects and relations") do
    ActiveRecord::Base.connection.execute <<~SQL
      INSERT INTO projects (name, path, creator_id, namespace_id, visibility_level, created_at, updated_at)
      SELECT
      'Seed project ' || seq || ' ' || ('{#{visibility_per_user}}'::text[])[seq] AS project_name,
      '#{MASS_INSERT_NAME_START}' || ('{#{visibility_per_user}}'::text[])[seq] || '_' || seq AS project_path,
      u.id AS user_id,
      n.id AS namespace_id,
      ('{#{visibility_level_per_user}}'::int[])[seq] AS visibility_level,
      NOW() AS created_at,
      NOW() AS updated_at
      FROM users u
      CROSS JOIN generate_series(1, #{projects_per_user_count}) AS seq
      JOIN namespaces n ON n.owner_id=u.id
    SQL

    ActiveRecord::Base.connection.execute <<~SQL
      INSERT INTO project_features (project_id, merge_requests_access_level, issues_access_level, wiki_access_level,
      pages_access_level)
      SELECT
      id,
      #{ProjectFeature::ENABLED} AS merge_requests_access_level,
      #{ProjectFeature::ENABLED} AS issues_access_level,
      #{ProjectFeature::ENABLED} AS wiki_access_level,
      #{ProjectFeature::ENABLED} AS pages_access_level
      FROM projects ON CONFLICT (project_id) DO NOTHING;
    SQL

    ActiveRecord::Base.connection.execute <<~SQL
      INSERT INTO routes (source_id, source_type, name, path)
      SELECT
      p.id,
      'Project',
      u.name || ' / ' || p.name,
      u.username || '/' || p.path
      FROM projects p JOIN users u ON u.id=p.creator_id
      ON CONFLICT (source_type, source_id) DO NOTHING;
    SQL
  end
end
end
# Run the project seeder inside Gitlab::Seeder.quiet.
Gitlab::Seeder.quiet do
  Gitlab::Seeder::Projects.new.seed!
end
......@@ -43,7 +43,7 @@ Gitlab::Seeder.quiet do
end
puts "\nGenerating project labels"
Project.all.find_each do |project|
Project.not_mass_generated.find_each do |project|
Gitlab::Seeder::ProjectLabels.new(project).seed!
end
end
require './spec/support/sidekiq'
# Seeds projects: imports real repositories listed in PROJECT_URLS, then
# mass-inserts generated projects for each visibility level listed in
# MASS_PROJECTS_COUNT.
class Gitlab::Seeder::Projects
  include ActionView::Helpers::NumberHelper

  PROJECT_URLS = [
    'https://gitlab.com/gitlab-org/gitlab-test.git',
    'https://gitlab.com/gitlab-org/gitlab-ce.git',
    'https://gitlab.com/gitlab-org/gitlab-ci.git',
    'https://gitlab.com/gitlab-org/gitlab-shell.git',
    'https://github.com/documentcloud/underscore.git',
    'https://github.com/twitter/flight.git',
    'https://github.com/twitter/typeahead.js.git',
    'https://github.com/h5bp/html5-boilerplate.git',
    'https://github.com/google/material-design-lite.git',
    'https://github.com/jlevy/the-art-of-command-line.git',
    'https://github.com/FreeCodeCamp/freecodecamp.git',
    'https://github.com/google/deepdream.git',
    'https://github.com/jtleek/datasharing.git',
    'https://github.com/WebAssembly/design.git',
    'https://github.com/airbnb/javascript.git',
    'https://github.com/tessalt/echo-chamber-js.git',
    'https://github.com/atom/atom.git',
    'https://github.com/mattermost/platform.git',
    'https://github.com/purifycss/purifycss.git',
    'https://github.com/facebook/nuclide.git',
    'https://github.com/wbkd/awesome-d3.git',
    'https://github.com/kilimchoi/engineering-blogs.git',
    'https://github.com/gilbarbara/logos.git',
    'https://github.com/gaearon/redux.git',
    'https://github.com/awslabs/s2n.git',
    'https://github.com/arkency/reactjs_koans.git',
    'https://github.com/twbs/bootstrap.git',
    'https://github.com/chjj/ttystudio.git',
    'https://github.com/DrBoolean/mostly-adequate-guide.git',
    'https://github.com/octocat/Spoon-Knife.git',
    'https://github.com/opencontainers/runc.git',
    'https://github.com/googlesamples/android-topeka.git'
  ].freeze

  # Number of generated projects to insert for each visibility level.
  MASS_PROJECTS_COUNT = {
    private: 2_000_000,
    internal: 30_000,
    public: 265_000
  }.freeze

  attr_reader :opts

  # opts - Hash of seeding options; :count is how many PROJECT_URLS entries to import.
  def initialize(opts = {})
    @opts = opts
  end

  # Runs both seeding steps inside a Sidekiq::Testing.inline! block.
  def seed!
    Sidekiq::Testing.inline! do
      create_real_projects!(opts[:count])
      create_mass_projects!
    end
  end

  private

  # Imports the first `count` repositories of PROJECT_URLS. The group is looked
  # up by the second-to-last URL segment and created (owned by User.first) when
  # missing. Prints '.' per valid project with a valid repo, otherwise the
  # validation errors followed by 'F'.
  def create_real_projects!(count)
    PROJECT_URLS.first(count).each do |url|
      group_path, project_path = url.split('/')[-2..-1]

      group = Group.find_by(path: group_path)

      unless group
        group = Group.new(
          name: group_path.titleize,
          path: group_path
        )
        group.description = FFaker::Lorem.sentence
        group.save

        group.add_owner(User.first)
      end

      project_path.gsub!(".git", "")

      params = {
        import_url: url,
        namespace_id: group.id,
        name: project_path.titleize,
        description: FFaker::Lorem.sentence,
        visibility_level: Gitlab::VisibilityLevel.values.sample
      }

      project = ::Projects::CreateService.new(User.first, params).execute

      # Seed-Fu runs this entire fixture in a transaction, so the `after_commit`
      # hook won't run until after the fixture is loaded. That is too late
      # since the Sidekiq::Testing block has already exited. Force clearing
      # the `after_commit` queue to ensure the job is run now.
      project.send(:_run_after_commit_queue)

      if project.valid? && project.valid_repo?
        print '.'
      else
        puts project.errors.full_messages
        print 'F'
      end
    end
  end

  # Mass-inserts projects for the private, internal and public visibility
  # levels with the ActiveRecord logger silenced.
  def create_mass_projects!
    # Disable database insertion logs so speed isn't limited by ability to print to console
    old_logger = ActiveRecord::Base.logger
    ActiveRecord::Base.logger = nil

    %i[private internal public].each do |visibility|
      create_mass_projects_by_visibility!(visibility)
    end
  ensure
    # Reset logging (in `ensure` so a failed insert cannot leave logging disabled)
    ActiveRecord::Base.logger = old_logger
  end

  # Inserts MASS_PROJECTS_COUNT[visibility] projects named
  # "seed_<visibility>_project_<seq>" via Project.insert_using_generate_series.
  def create_mass_projects_by_visibility!(visibility)
    users = User.limit(100)
    groups = Group.limit(100)
    # `users + groups` is a plain Array, not a relation.
    namespaces = users + groups

    Project.insert_using_generate_series(1, MASS_PROJECTS_COUNT[visibility], debug: true) do |sql|
      project_name = raw("'seed_#{visibility}_project_' || seq")

      # Array#take requires a count (calling it bare raises ArgumentError),
      # so pick the first candidate explicitly.
      namespace = namespaces.first

      sql.name = project_name
      sql.path = project_name
      sql.creator_id = namespace.is_a?(Group) ? namespace.owner_id : users.take.id
      # NOTE(review): the non-Group branch relies on User#namespace_id - confirm it exists.
      sql.namespace_id = namespace.is_a?(Group) ? namespace.id : namespace.namespace_id
      sql.visibility_level = Gitlab::VisibilityLevel.level_value(visibility.to_s)
    end

    puts "#{number_with_delimiter(MASS_PROJECTS_COUNT[visibility])} projects created!"
  end
end
# Run the project seeder inside Gitlab::Seeder.quiet; the SIZE env variable
# sets how many real projects to import (8 when blank).
Gitlab::Seeder.quiet do
  requested = ENV['SIZE'].presence&.to_i || 8

  Gitlab::Seeder::Projects.new(count: requested).seed!
end
......@@ -3,7 +3,7 @@ require './spec/support/sidekiq'
Sidekiq::Testing.inline! do
Gitlab::Seeder.quiet do
Group.all.each do |group|
User.all.sample(4).each do |user|
User.not_mass_generated.sample(4).each do |user|
if group.add_user(user, Gitlab::Access.values.sample).persisted?
print '.'
else
......@@ -12,8 +12,8 @@ Sidekiq::Testing.inline! do
end
end
Project.all.each do |project|
User.all.sample(4).each do |user|
Project.not_mass_generated.each do |project|
User.not_mass_generated.sample(4).each do |user|
if project.add_role(user, Gitlab::Access.sym_options.keys.sample)
print '.'
else
......
require './spec/support/sidekiq'
Gitlab::Seeder.quiet do
Project.all.each do |project|
Project.not_mass_generated.each do |project|
5.times do |i|
milestone_params = {
title: "v#{i}.0",
......
......@@ -4,7 +4,13 @@ Gitlab::Seeder.quiet do
# Limit the number of merge requests per project to avoid long seeds
MAX_NUM_MERGE_REQUESTS = 10
Project.non_archived.with_merge_requests_enabled.reject(&:empty_repo?).each do |project|
projects = Project
.non_archived
.with_merge_requests_enabled
.not_mass_generated
.reject(&:empty_repo?)
projects.each do |project|
branches = project.repository.branch_names.sample(MAX_NUM_MERGE_REQUESTS * 2)
branches.each do |branch_name|
......
......@@ -9,7 +9,7 @@ Sidekiq::Testing.disable! do
# that it falls under `Sidekiq::Testing.disable!`.
Key.skip_callback(:commit, :after, :add_to_shell)
User.first(10).each do |user|
User.not_mass_generated.first(10).each do |user|
key = "ssh-rsa AAAAB3NzaC1yc2EAAAABJQAAAIEAiPWx6WM4lhHNedGfBpPJNPpZ7yKu+dnn1SJejgt#{user.id + 100}6k6YjzGGphH2TUxwKzxcKDKKezwkpfnxPkSMkuEspGRt/aZZ9wa++Oi7Qkr8prgHc4soW6NUlfDzpvZK2H5E7eQaSeP3SAwGmQKUFHCddNaP0L+hM7zhFNzjFvpaMgJw0="
key = user.keys.create(
......
......@@ -25,7 +25,7 @@ end
eos
50.times do |i|
user = User.all.sample
user = User.not_mass_generated.sample
PersonalSnippet.seed(:id, [{
id: i,
......
......@@ -214,7 +214,7 @@ class Gitlab::Seeder::Pipelines
end
Gitlab::Seeder.quiet do
Project.all.sample(5).each do |project|
Project.not_mass_generated.sample(5).each do |project|
project_builds = Gitlab::Seeder::Pipelines.new(project)
project_builds.seed!
end
......
......@@ -3,7 +3,7 @@ require './spec/support/sidekiq'
Gitlab::Seeder.quiet do
admin_user = User.find(1)
Project.all.each do |project|
Project.not_mass_generated.each do |project|
params = {
name: 'master'
}
......
......@@ -217,7 +217,7 @@ Gitlab::Seeder.quiet do
flag = 'SEED_CYCLE_ANALYTICS'
if ENV[flag]
Project.find_each do |project|
Project.not_mass_generated.find_each do |project|
# This seed naively assumes that every project has a repository, and every
# repository has a `master` branch, which may be the case for a pristine
# GDK seed, but is almost never true for a GDK that's actually had
......
......@@ -67,7 +67,7 @@ class Gitlab::Seeder::Environments
end
Gitlab::Seeder.quiet do
Project.all.sample(5).each do |project|
Project.not_mass_generated.sample(5).each do |project|
project_environments = Gitlab::Seeder::Environments.new(project)
project_environments.seed!
end
......
......@@ -22,7 +22,7 @@ module Db
end
def self.random_user
User.find(User.pluck(:id).sample)
User.find(User.not_mass_generated.pluck(:id).sample)
end
end
end
......
......@@ -2,8 +2,8 @@ require './spec/support/sidekiq'
Sidekiq::Testing.inline! do
Gitlab::Seeder.quiet do
User.all.sample(10).each do |user|
source_project = Project.public_only.sample
User.not_mass_generated.sample(10).each do |user|
source_project = Project.not_mass_generated.public_only.sample
##
# 03_project.rb might not have created a public project because
......
......@@ -12,6 +12,14 @@ The `setup` task is an alias for `gitlab:setup`.
This task calls `db:reset` to create the database, and calls `db:seed_fu` to seed the database.
Note: `db:setup` calls `db:seed` but this does nothing.
### Env variables
**MASS_INSERT**: Create millions of users (2m), projects (5m) and their
relations. It's highly recommended to run the seed with it to catch slow queries
while developing. Expect the process to take up to 20 extra minutes.
**LARGE_PROJECTS**: Create large projects (through import) from a predefined set of urls.
### Seeding issues for all or a given project
You can seed issues for all or a given project with the `gitlab:seed:issues`
......
......@@ -88,7 +88,7 @@ Gitlab::Seeder.quiet do
seeder = Gitlab::Seeder::Burndown.new(project)
seeder.seed!
else
Project.all.each do |project|
Project.not_mass_generated.each do |project|
seeder = Gitlab::Seeder::Burndown.new(project)
seeder.seed!
end
......
......@@ -128,7 +128,7 @@ class Gitlab::Seeder::Vulnerabilities
end
Gitlab::Seeder.quiet do
Project.joins(:ci_pipelines).distinct.all.sample(5).each do |project|
Project.joins(:ci_pipelines).not_mass_generated.distinct.all.sample(5).each do |project|
seeder = Gitlab::Seeder::Vulnerabilities.new(project)
seeder.seed!
end
......
# frozen_string_literal: true
# EE fixture
Gitlab::Seeder.quiet do
Project.all.sample(5).each do |project|
Project.not_mass_generated.sample(5).each do |project|
project.ci_pipelines.all.sample(2).each do |pipeline|
next if pipeline.source_pipeline
......
......@@ -32,7 +32,7 @@ class Gitlab::Seeder::Packages
end
Gitlab::Seeder.quiet do
Project.all.sample(5).each do |project|
Project.not_mass_generated.sample(5).each do |project|
Gitlab::Seeder::Packages.new(project.owner, project).seed
end
end
......@@ -14,7 +14,71 @@ end
module Gitlab
class Seeder
extend ActionView::Helpers::NumberHelper
ESTIMATED_INSERT_PER_MINUTE = 2_000_000
MASS_INSERT_ENV = 'MASS_INSERT'
# Seed-only concern: adds a `not_mass_generated` scope to the including model.
module ProjectSeed
  extend ActiveSupport::Concern

  included do
    # Excludes rows whose path matches the mass-insert project prefix.
    scope :not_mass_generated, -> {
      where.not("path LIKE '#{Gitlab::Seeder::Projects::MASS_INSERT_NAME_START}%'")
    }
  end
end
# Seed-only concern: adds a `not_mass_generated` scope to the including model.
module UserSeed
  extend ActiveSupport::Concern

  included do
    # Excludes rows whose username matches the mass-insert user prefix.
    scope :not_mass_generated, -> {
      where.not("username LIKE '#{Gitlab::Seeder::Users::MASS_INSERT_USERNAME_START}%'")
    }
  end
end
# Wraps a mass-insert step with console output.
#
# size  - number of records the block is expected to insert (used for messages only)
# model - a String label, or a model class whose humanized, pluralized name is used
#
# The block is skipped, with a hint printed, unless the MASS_INSERT_ENV or CI
# environment variable is set. Otherwise a "Creating" line and a time estimate
# are printed, the block is yielded to, and a "created!" line follows.
def self.with_mass_insert(size, model)
  humanized_model_name =
    if model.is_a?(String)
      model
    else
      model.model_name.human.pluralize(size)
    end

  unless ENV[MASS_INSERT_ENV] || ENV['CI']
    puts "\nSkipping mass insertion for #{humanized_model_name}."
    puts "Consider running the seed with #{MASS_INSERT_ENV}=1"
    return
  end

  formatted_size = number_with_delimiter(size)
  estimate = estimated_time_message(size)

  puts "\nCreating #{formatted_size} #{humanized_model_name}."
  puts estimate

  yield

  puts "\n#{formatted_size} #{humanized_model_name} created!"
end
# Builds the "Rough estimated time" line for inserting `size` records, based on
# ESTIMATED_INSERT_PER_MINUTE and rounded to whole minutes.
def self.estimated_time_message(size)
  minutes = (size.to_f / ESTIMATED_INSERT_PER_MINUTE).round
  return "Rough estimated time: less than a minute ⏰" if minutes.zero?

  "Rough estimated time: #{minutes} #{'minute'.pluralize(minutes)} ⏰"
end
def self.quiet
# Disable database insertion logs so speed isn't limited by ability to print to console
old_logger = ActiveRecord::Base.logger
ActiveRecord::Base.logger = nil
# Additional seed logic for models.
Project.include(ProjectSeed)
User.include(UserSeed)
mute_notifications
mute_mailer
......@@ -23,6 +87,7 @@ module Gitlab
yield
SeedFu.quiet = false
ActiveRecord::Base.logger = old_logger
puts "\nOK".color(:green)
end
......
......@@ -5,6 +5,10 @@ namespace :dev do
task setup: :environment do
ENV['force'] = 'yes'
Rake::Task["gitlab:setup"].invoke
# Make sure DB statistics are up to date.
ActiveRecord::Base.connection.execute('ANALYZE')
Rake::Task["gitlab:shell:setup"].invoke
end
......
......@@ -22,7 +22,7 @@ namespace :gitlab do
[project]
else
Project.find_each
Project.not_mass_generated.find_each
end
projects.each do |project|
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment