Commit 40919c67 authored by Rémy Coutable

Download Knapsack and flaky specs report from latest master artifacts

This dogfoods CI artifacts: it downloads the Knapsack report, flaky specs
report, and Crystalball report from the latest successful master
pipeline triggered by @gitlab-bot (this should be a scheduled pipeline
for which we're sure update-tests-metadata was run).

This also removes the need to upload these reports to S3, since they're
already uploaded as artifacts.
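
A minimal sketch of the idea (JOB_ID is illustrative; the endpoint is the
GitLab v4 job-artifacts API):

    # Download a single artifact file straight from a finished CI job,
    # with no S3 bucket in between. JOB_ID stands for the id of the
    # update-tests-metadata job found in that master pipeline.
    curl --silent --show-error \
      "https://gitlab.com/api/v4/projects/gitlab-org%2Fgitlab/jobs/${JOB_ID}/artifacts/knapsack/report-master.json" \
      > knapsack/report-master.json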
Signed-off-by: Rémy Coutable <remy@rymai.me>
parent f3c9c0e1
.tests-metadata-state:
  variables:
    TESTS_METADATA_S3_BUCKET: "gitlab-ce-cache"
  image: ruby:2.7
  before_script:
    - source scripts/utils.sh
  artifacts:
@@ -17,7 +16,8 @@ retrieve-tests-metadata:
    - .test-metadata:rules:retrieve-tests-metadata
  stage: prepare
  script:
    - source scripts/rspec_helpers.sh
    - install_api_client_dependencies_with_apt
    - source ./scripts/rspec_helpers.sh
    - retrieve_tests_metadata
update-tests-metadata:
...
@@ -12,8 +12,8 @@ Our current CI parallelization setup is as follows:
1. The `retrieve-tests-metadata` job in the `prepare` stage ensures we have a
`knapsack/report-master.json` file:
- The `knapsack/report-master.json` file is fetched from S3, if it's not here
we initialize the file with `{}`.
- The `knapsack/report-master.json` file is fetched from the latest `master`
pipeline which ran `update-tests-metadata` (for now, the 2-hourly scheduled
master pipeline); if it's missing, we initialize the file with `{}`.
1. Each `[rspec|rspec-ee] [unit|integration|system|geo] n m` job is run with
`knapsack rspec` and should have an evenly distributed share of tests (see
the sketch after this list):
- It works because the jobs have access to the `knapsack/report-master.json`
@@ -25,7 +25,7 @@ Our current CI parallelization setup is as follows:
1. The `update-tests-metadata` job (which only runs on scheduled pipelines for
[the canonical project](https://gitlab.com/gitlab-org/gitlab)) takes all the
`knapsack/rspec*_pg_*.json` files and merges them into a single
`knapsack/report-master.json` file that is then uploaded to S3.
`knapsack/report-master.json` file that is saved as an artifact.
After that, the next pipeline will use the up-to-date `knapsack/report-master.json` file.
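
For illustration, a minimal sketch of what each parallel node does with the
report (assuming the `knapsack` gem, and GitLab CI's `parallel:` keyword
providing `CI_NODE_INDEX`/`CI_NODE_TOTAL`):

```shell
# Point Knapsack at the shared timing report, then run only this
# node's slice of the suite; specs missing from the report are
# distributed evenly across nodes.
export KNAPSACK_REPORT_PATH="knapsack/report-master.json"
knapsack rspec
```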
...
#!/usr/bin/env bash
function retrieve_tests_metadata() {
mkdir -p knapsack/ rspec_flaky/ rspec_profiling/
mkdir -p crystalball/ knapsack/ rspec_flaky/ rspec_profiling/
local project_path="gitlab-org%2Fgitlab"
local latest_scheduled_master_pipeline_id
local job_id
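# Find the most recent successful master pipeline triggered by @gitlab-bot,
# then the update-tests-metadata job inside it: its artifacts hold the reports.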
latest_scheduled_master_pipeline_id=$(get_pipelines "${project_path}" "status=success&ref=master&username=gitlab-bot" | jq "first | .id")
job_id=$(get_job_id "${project_path}" "${latest_scheduled_master_pipeline_id}" "update-tests-metadata" "scope=success")
if [[ ! -f "${KNAPSACK_RSPEC_SUITE_REPORT_PATH}" ]]; then
wget -O "${KNAPSACK_RSPEC_SUITE_REPORT_PATH}" "http://${TESTS_METADATA_S3_BUCKET}.s3.amazonaws.com/${KNAPSACK_RSPEC_SUITE_REPORT_PATH}" || echo "{}" > "${KNAPSACK_RSPEC_SUITE_REPORT_PATH}"
get_job_artifact "${project_path}" "${job_id}" "${KNAPSACK_RSPEC_SUITE_REPORT_PATH}" > "${KNAPSACK_RSPEC_SUITE_REPORT_PATH}" || echo "{}" > "${KNAPSACK_RSPEC_SUITE_REPORT_PATH}"
fi
if [[ ! -f "${FLAKY_RSPEC_SUITE_REPORT_PATH}" ]]; then
wget -O "${FLAKY_RSPEC_SUITE_REPORT_PATH}" "http://${TESTS_METADATA_S3_BUCKET}.s3.amazonaws.com/${FLAKY_RSPEC_SUITE_REPORT_PATH}" || echo "{}" > "${FLAKY_RSPEC_SUITE_REPORT_PATH}"
get_job_artifact "${project_path}" "${job_id}" "${FLAKY_RSPEC_SUITE_REPORT_PATH}" > "${FLAKY_RSPEC_SUITE_REPORT_PATH}" || echo "{}" > "${FLAKY_RSPEC_SUITE_REPORT_PATH}"
fi
# Disabled for now
# if [[ ! -f "${RSPEC_PACKED_TESTS_MAPPING_PATH}" ]]; then
# (get_job_artifact "${project_path}" "${job_id}" "${RSPEC_PACKED_TESTS_MAPPING_PATH}.gz" > "${RSPEC_PACKED_TESTS_MAPPING_PATH}.gz" && gzip -d "${RSPEC_PACKED_TESTS_MAPPING_PATH}.gz") || echo "{}" > "${RSPEC_PACKED_TESTS_MAPPING_PATH}"
# fi
#
# scripts/unpack-test-mapping "${RSPEC_PACKED_TESTS_MAPPING_PATH}" "${RSPEC_TESTS_MAPPING_PATH}"
}
function update_tests_metadata() {
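# Start from an empty report, then merge the per-node knapsack/rspec*.json
# timing files produced by this pipeline into the master report.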
echo "{}" > "${KNAPSACK_RSPEC_SUITE_REPORT_PATH}"
scripts/merge-reports "${KNAPSACK_RSPEC_SUITE_REPORT_PATH}" knapsack/rspec*.json
if [[ -n "${TESTS_METADATA_S3_BUCKET}" ]]; then
if [[ "$CI_PIPELINE_SOURCE" == "schedule" ]]; then
scripts/sync-reports put "${TESTS_METADATA_S3_BUCKET}" "${KNAPSACK_RSPEC_SUITE_REPORT_PATH}"
else
echo "Not uplaoding report to S3 as the pipeline is not a scheduled one."
fi
fi
rm -f knapsack/rspec*.json
scripts/merge-reports "${FLAKY_RSPEC_SUITE_REPORT_PATH}" rspec_flaky/all_*.json
export FLAKY_RSPEC_GENERATE_REPORT="true"
scripts/merge-reports "${FLAKY_RSPEC_SUITE_REPORT_PATH}" rspec_flaky/all_*.json
scripts/flaky_examples/prune-old-flaky-examples "${FLAKY_RSPEC_SUITE_REPORT_PATH}"
if [[ -n ${TESTS_METADATA_S3_BUCKET} ]]; then
if [[ "$CI_PIPELINE_SOURCE" == "schedule" ]]; then
scripts/sync-reports put "${TESTS_METADATA_S3_BUCKET}" "${FLAKY_RSPEC_SUITE_REPORT_PATH}"
else
echo "Not uploading report to S3 as the pipeline is not a scheduled one."
fi
fi
rm -f rspec_flaky/all_*.json rspec_flaky/new_*.json
if [[ "$CI_PIPELINE_SOURCE" == "schedule" ]]; then
@@ -48,16 +43,6 @@ function update_tests_metadata() {
fi
}
function retrieve_tests_mapping() {
mkdir -p crystalball/
if [[ ! -f "${RSPEC_PACKED_TESTS_MAPPING_PATH}" ]]; then
(wget -O "${RSPEC_PACKED_TESTS_MAPPING_PATH}.gz" "http://${TESTS_METADATA_S3_BUCKET}.s3.amazonaws.com/${RSPEC_PACKED_TESTS_MAPPING_PATH}.gz" && gzip -d "${RSPEC_PACKED_TESTS_MAPPING_PATH}.gz") || echo "{}" > "${RSPEC_PACKED_TESTS_MAPPING_PATH}"
fi
scripts/unpack-test-mapping "${RSPEC_PACKED_TESTS_MAPPING_PATH}" "${RSPEC_TESTS_MAPPING_PATH}"
}
function update_tests_mapping() {
if ! crystalball_rspec_data_exists; then
echo "No crystalball rspec data found."
@@ -65,19 +50,8 @@ function update_tests_mapping() {
fi
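# Build the test mapping from the Crystalball data, pack it, and compress it
# so it can be saved as an artifact.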
scripts/generate-test-mapping "${RSPEC_TESTS_MAPPING_PATH}" crystalball/rspec*.yml
scripts/pack-test-mapping "${RSPEC_TESTS_MAPPING_PATH}" "${RSPEC_PACKED_TESTS_MAPPING_PATH}"
gzip "${RSPEC_PACKED_TESTS_MAPPING_PATH}"
if [[ -n "${TESTS_METADATA_S3_BUCKET}" ]]; then
if [[ "$CI_PIPELINE_SOURCE" == "schedule" ]]; then
scripts/sync-reports put "${TESTS_METADATA_S3_BUCKET}" "${RSPEC_PACKED_TESTS_MAPPING_PATH}.gz"
else
echo "Not uploading report to S3 as the pipeline is not a scheduled one."
fi
fi
rm -f crystalball/rspec*.yml
}
...
@@ -87,24 +87,38 @@ function echosuccess() {
fi
}
function get_job_id() {
local job_name="${1}"
function get_pipelines() {
local project_id="${1}"
local query_string="${2:+&${2}}"
local url="https://gitlab.com/api/v4/projects/${project_id}/pipelines?per_page=100${query_string}"
echoinfo "GET ${url}"
curl --silent --show-error "${url}"
}
function get_job_id() {
local project_id="${1}"
local pipeline_id="${2}"
local job_name="${3}"
local query_string="${4:+&${4}}"
local api_token="${API_TOKEN-${GITLAB_BOT_MULTI_PROJECT_PIPELINE_POLLING_TOKEN}}"
if [ -z "${api_token}" ]; then
echoerr "Please provide an API token with \$API_TOKEN or \$GITLAB_BOT_MULTI_PROJECT_PIPELINE_POLLING_TOKEN."
return
local curl_opts=()
if [ -n "${api_token}" ]; then
curl_opts=(--header "PRIVATE-TOKEN: ${api_token}")
else
echoinfo "No API token given with \$API_TOKEN or \$GITLAB_BOT_MULTI_PROJECT_PIPELINE_POLLING_TOKEN."
fi
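# The pipeline jobs endpoint is paginated; look for the job on up to
# max_page pages before giving up.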
local max_page=3
local page=1
while true; do
local url="https://gitlab.com/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?per_page=100&page=${page}${query_string}"
local url="https://gitlab.com/api/v4/projects/${project_id}/pipelines/${pipeline_id}/jobs?per_page=100&page=${page}${query_string}"
echoinfo "GET ${url}"
local job_id
job_id=$(curl --silent --show-error --header "PRIVATE-TOKEN: ${api_token}" "${url}" | jq "map(select(.name == \"${job_name}\")) | map(.id) | last")
job_id=$(curl --silent --show-error "${curl_opts[@]}" "${url}" | jq "map(select(.name == \"${job_name}\")) | map(.id) | last")
[[ "${job_id}" == "null" && "${page}" -lt "$max_page" ]] || break
let "page++"
@@ -118,10 +132,21 @@ function get_job_id() {
fi
}
function get_job_artifact() {
local project_id="${1}"
local job_id="${2}"
local artifact_path="${3}"
local url="https://gitlab.com/api/v4/projects/${project_id}/jobs/${job_id}/artifacts/${artifact_path}"
echoinfo "GET ${url}"
curl --silent --show-error "${url}"
}
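# Usage sketch: chained together, these helpers reproduce the retrieval flow
# used by retrieve_tests_metadata above (the project path is URL-encoded):
#
#   pipeline_id=$(get_pipelines "gitlab-org%2Fgitlab" "status=success&ref=master&username=gitlab-bot" | jq "first | .id")
#   job_id=$(get_job_id "gitlab-org%2Fgitlab" "${pipeline_id}" "update-tests-metadata" "scope=success")
#   get_job_artifact "gitlab-org%2Fgitlab" "${job_id}" "knapsack/report-master.json" > knapsack/report-master.json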
function play_job() {
local job_name="${1}"
local job_id
job_id=$(get_job_id "${job_name}" "scope=manual");
job_id=$(get_job_id "${CI_PROJECT_ID}" "${CI_PIPELINE_ID}" "${job_name}" "scope=manual");
if [ -z "${job_id}" ]; then return; fi
local api_token="${API_TOKEN-${GITLAB_BOT_MULTI_PROJECT_PIPELINE_POLLING_TOKEN}}"
@@ -140,7 +165,7 @@ function play_job() {
function fail_pipeline_early() {
local dont_interrupt_me_job_id
dont_interrupt_me_job_id=$(get_job_id 'dont-interrupt-me' 'scope=success')
dont_interrupt_me_job_id=$(get_job_id "${CI_PROJECT_ID}" "${CI_PIPELINE_ID}" "dont-interrupt-me" "scope=success")
if [[ -n "${dont_interrupt_me_job_id}" ]]; then
echoinfo "This pipeline cannot be interrupted due to \`dont-interrupt-me\` job ${dont_interrupt_me_job_id}"
...