Commit 1b274d0d authored by Kirill Smelkov's avatar Kirill Smelkov

Teach gitlab-workhorse to serve requests to get raw blobs

Currently GitLab serves requests to get raw blobs via Ruby-on-Rails code and
Unicorn. Because RoR/Unicorn is relatively heavyweight, in an environment where
there are a lot of simultaneous requests to get raw blobs, this works very
slowly and the server is constantly overloaded.

On the other hand, to get raw blob content, we do not need anything from RoR
framework - we only need to have access to project git repository on filesystem,
and knowing whether access for getting data from there should be granted or
not. That means it is possible to adjust Nginx frontend to route '.../raw/....'
request to more lightweight and performant program which does this particular
task and that will be a net win.

As gitlab-workhorse is written in Go, and Go has good concurrency/parallelism
support and is generally much faster than Ruby, adding raw blob serving task to
it makes sense.

In this patch: we add infrastructure to process GET request for '/raw/...':

- extract project / ref and path from URL
- query auth backend for whether download access should be granted or not
- emit blob content via spawning external `git cat-file`

I've tried to make the output as close as possible to the one emitted by RoR
code, with the idea that for users the change should be transparent.

As in this patch we do an auth backend query for every request to get a blob,
RoR code is still heavily loaded, so essentially there is no speedup yet:

  (on an 8-CPU i7-3770S with 16GB of RAM)

  # request goes to unicorn  (9 unicorn workers)
  $ ./wrk -c40 -d10 -t1 --latency https://[2001:67c:1254:e:8b::c776]:7777/root/slapos/raw/master/software/wendelin/software.cfg
  Running 10s test @ https://[2001:67c:1254:e:8b::c776]:7777/root/slapos/raw/master/software/wendelin/software.cfg
    1 threads and 40 connections
    Thread Stats   Avg      Stdev     Max   +/- Stdev
      Latency   553.06ms  166.39ms   1.29s    80.06%
      Req/Sec    69.53     23.12   140.00     71.72%
    Latency Distribution
       50%  525.41ms
       75%  615.63ms
       90%  774.48ms
       99%    1.05s
    695 requests in 10.02s, 1.38MB read
  Requests/sec:     69.38
  Transfer/sec:    141.47KB

  # request goes to gitlab-workhorse with the following added to nginx conf
  # location ~ ^/[\w\.-]+/[\w\.-]+/raw/ {
  #   error_page 418 = @gitlab-workhorse;
  #   return 418;
  # }
  $ ./wrk -c40 -d10 -t1 --latency https://[2001:67c:1254:e:8b::c776]:7777/root/slapos/raw/master/software/wendelin/software.cfg
  Running 10s test @ https://[2001:67c:1254:e:8b::c776]:7777/root/slapos/raw/master/software/wendelin/software.cfg
    1 threads and 40 connections
    Thread Stats   Avg      Stdev     Max   +/- Stdev
      Latency   549.37ms  220.53ms   1.69s    84.74%
      Req/Sec    71.01     25.49   160.00     70.71%
    Latency Distribution
       50%  514.66ms
       75%  584.32ms
       90%  767.46ms
       99%    1.37s
    709 requests in 10.01s, 1.26MB read
  Requests/sec:     70.83
  Transfer/sec:    128.79KB

In the next patch we'll cache requests to auth backend and that will improve
performance dramatically.
parent d1b215d5
// Handler for raw blob downloads
//
// Blobs are read via `git cat-file ...` with first querying authentication
// backend about download-access permission for containing repository.
package main
import (
"bufio"
"errors"
"fmt"
"io"
"log"
"net/http"
"net/http/httptest"
"regexp"
"strings"
)
// AuthReply is the reply from auth backend for a "download from repo"
// authorization request.
type AuthReply struct {
// RawReply is the raw reply from auth backend & preAuthorizeHandler().
// It is recorded so we can replay it to the client in full
// if e.g. access is rejected.
RawReply *httptest.ResponseRecorder
// Decoded auth reply. askAuthBackend fills it in only when the auth
// handler approved access, so an empty RepoPath means "not granted".
authorizationResponse
}
// askAuthBackend asks auth backend whether download is ok for a project.
//
// project is the project part of the request URL path; query and header are
// the raw query string and the headers of the client request. They are
// propagated to the auth backend so it can authenticate the user.
//
// Authorization is approved if AuthReply.RepoPath != "" on return.
// Raw auth backend response is emitted to AuthReply.RawReply.
func askAuthBackend(u *upstream, project, query string, header http.Header) AuthReply {
	authReply := AuthReply{
		RawReply: httptest.NewRecorder(),
	}

	// Request to auth backend to verify whether download is possible.
	// - first option is via asking as `git fetch` would do, but on Rails
	//   side this supports only basic auth, not private token.
	// - that's why we ask auth backend to authenticate as if it was a
	//   request to get repo archive and propagate request query and header.
	// url := project + ".git/info/refs?service=git-upload-pack"
	url := project + "/repository/archive.zip"
	if query != "" {
		url += "?" + query
	}
	reqDownloadAccess, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// Name the request we actually build. The full url is deliberately
		// not included: the query may carry a private token.
		fail500(authReply.RawReply, fmt.Errorf("GET repository/archive.zip: %v", err))
		return authReply
	}
	for k, v := range header {
		reqDownloadAccess.Header[k] = v
	}

	// Prepare everything and go through preAuthorizeHandler() that will send
	// request to auth backend and analyze/parse the reply into r.authorizationResponse.
	// It also logs/emits output in case of errors - we do not have to do it here.
	r := &gitRequest{
		Request: reqDownloadAccess,
		u:       u,
	}
	preAuthorizeHandler(
		func(w http.ResponseWriter, r *gitRequest) {
			// if we ever get to this point - auth handler approved
			// access and thus it is ok to download
			authReply.authorizationResponse = r.authorizationResponse
		}, "")(authReply.RawReply, r)

	return authReply
}
// rawRe locates the `/raw/` separator that splits a request path into
// <project> and <ref>/path.
var rawRe = regexp.MustCompile(`/raw/`)

// handleGetBlobRaw is the HTTP handler for `.../raw/<ref>/path`.
//
// It splits the URL into project and refpath, asks the auth backend whether
// download from the project is allowed and, if so, emits the blob. Otherwise
// the recorded auth backend reply is replayed to the client.
func handleGetBlobRaw(w http.ResponseWriter, r *gitRequest) {
	// <project>/raw/branch/file -> <project>, branch/file
	reqURL := r.Request.URL
	loc := rawRe.FindStringIndex(reqURL.Path)
	if loc == nil {
		fail500(w, errors.New("extract project name"))
		return
	}
	project, refpath := reqURL.Path[:loc[0]], reqURL.Path[loc[1]:]

	// Query download access auth for this project
	authReply := askAuthBackend(r.u, project, reqURL.RawQuery, r.Request.Header)

	// Access granted - we can emit the blob
	if authReply.RepoPath != "" {
		emitBlob(w, authReply.RepoPath, refpath)
		return
	}

	// Access denied - replay the auth reply to the client in full: its HTTP
	// code, headers and body tell why access was denied.
	denied := authReply.RawReply
	for name, values := range denied.HeaderMap {
		w.Header()[name] = values
	}
	w.WriteHeader(denied.Code)
	if _, err := io.Copy(w, denied.Body); err != nil {
		logError(fmt.Errorf("writing authReply.RawReply.Body: %v", err))
	}
}
// emitBlob emits content of blob located at <ref>/path (jointly denoted as
// 'refpath') in the git repository at repopath to w.
//
// Both ref and path may contain '/', so the split point is not known up
// front: candidates are tried from the longest ref prefix to the shortest by
// chatting with a single `git cat-file --batch` process.
//
// The client gets the blob with status 200, a 404 if no blob matches, or
// whatever fail500 emits when talking to git fails.
func emitBlob(w http.ResponseWriter, repopath string, refpath string) {
	// `git cat-file --batch` reads one object name per line. refpath comes
	// from the (already percent-decoded) request URL, so an embedded LF would
	// be seen by git as several queries and desynchronize the request/reply
	// chat below. Such a name cannot be addressed via this protocol at all,
	// so report it as not found without spawning git.
	if strings.Contains(refpath, "\n") {
		http.Error(w, fmt.Sprintf("Blob for %v not found", refpath), http.StatusNotFound)
		return
	}

	// Communicate with `git cat-file --batch` trying refs from longest
	// to shortest prefix in refpath. This way we find longest-match for
	// ref and get blob sha1 and content in the end.
	queryCmd := gitCommand("", "git", "--git-dir="+repopath, "cat-file", "--batch")
	queryStdin, err := queryCmd.StdinPipe()
	if err != nil {
		fail500(w, fmt.Errorf("git cat-file --batch; stdin: %v", err))
		return
	}
	defer queryStdin.Close()
	queryStdout, err := queryCmd.StdoutPipe()
	if err != nil {
		fail500(w, fmt.Errorf("git cat-file --batch; stdout: %v", err))
		return
	}
	defer queryStdout.Close()
	queryReader := bufio.NewReader(queryStdout)

	err = queryCmd.Start()
	if err != nil {
		fail500(w, fmt.Errorf("git cat-file --batch; start: %v", err))
		return
	}
	// NOTE(review): presumably terminates/reaps git on every return path - confirm.
	defer cleanUpProcessGroup(queryCmd)

	// refpath components as vector
	refpathv := strings.Split(refpath, "/")

	// scan from right to left and try to change '/' -> ':' and see if it
	// creates a correct git object name. If it does - we read object
	// content which follows.
	var sha1, objType string
	var size int64
	for i := len(refpathv); i > 0; i-- {
		ref := strings.Join(refpathv[:i], "/")
		path := strings.Join(refpathv[i:], "/")
		_, err := fmt.Fprintf(queryStdin, "%s:%s\n", ref, path)
		if err != nil {
			fail500(w, fmt.Errorf("git cat-file --batch; write: %v", err))
			return
		}

		reply, err := queryReader.ReadString('\n')
		if err != nil {
			fail500(w, fmt.Errorf("git cat-file --batch; read: %v", err))
			return
		}

		// <object> SP missing LF
		// <object> SP ambiguous LF   (ambiguous short sha1)
		// Neither is followed by content - try the next, shorter, ref.
		if strings.HasSuffix(reply, " missing\n") || strings.HasSuffix(reply, " ambiguous\n") {
			continue
		}

		// <sha1> SP <type> SP <size> LF
		_, err = fmt.Sscanf(reply, "%s %s %d\n", &sha1, &objType, &size)
		if err != nil {
			fail500(w, fmt.Errorf("git cat-file --batch; reply parse: %v", err))
			return
		}

		if objType != "blob" {
			log.Printf("git cat-file --batch; %v is not blob (is %v)", sha1, objType)
			sha1 = "" // so it will return 404
		}

		// git object found
		break
	}

	// Blob not found -> 404
	if sha1 == "" {
		http.Error(w, fmt.Sprintf("Blob for %v not found", refpath), http.StatusNotFound)
		return
	}

	// Blob found - start writing response
	w.Header().Set("Content-Disposition", "inline")
	w.Header().Set("Content-Transfer-Encoding", "binary")
	w.Header().Set("Content-Length", fmt.Sprintf("%d", size))
	w.Header().Set("X-Content-Type-Options", "nosniff")
	// net/http sniffs stream and automatically detects and sets
	// Content-Type header. We do not have to do it ourselves.
	w.Header().Set("Cache-Control", "private") // Rails sets this for IE compatibility
	w.Header().Set("ETag", fmt.Sprintf(`"%s"`, sha1))
	w.WriteHeader(http.StatusOK) // Don't bother with HTTP 500 from this point on, just return

	// XXX better use queryStdout instead of queryReader, but we could be
	// holding some tail bytes in queryReader after chat phase
	_, err = io.CopyN(w, queryReader, size)
	if err != nil {
		logError(fmt.Errorf("io.CopyN: %v", err))
		return
	}

	// close git stdin explicitly, so it can exit cleanly
	err = queryStdin.Close()
	if err != nil {
		logError(fmt.Errorf("queryStdin.Close: %v", err))
		return
	}

	err = queryCmd.Wait()
	if err != nil {
		logError(fmt.Errorf("wait: %v", err))
		return
	}
}
......@@ -2,6 +2,7 @@ package main
import (
"bytes"
"crypto/sha1"
"encoding/json"
"fmt"
"io/ioutil"
......@@ -453,3 +454,139 @@ func deniedXSendfileDownload(t *testing.T, contentFilename string, filePath stri
t.Fatal("Unexpected file contents in download")
}
}
// sha1(data) as human-readable string
func sha1s(data []byte) string {
return fmt.Sprintf("%x", sha1.Sum(data))
}
// download fetches url with a GET request carrying headers h and returns the
// response together with its fully read body. Any failure along the way
// aborts the test via t.Fatal.
func download(t *testing.T, url string, h http.Header) (*http.Response, []byte) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		t.Fatal(err)
	}

	// copy header to request
	for name, values := range h {
		request.Header[name] = values
	}

	response, err := new(http.Client).Do(request)
	if err != nil {
		t.Fatal(err)
	}
	defer response.Body.Close()

	content, err := ioutil.ReadAll(response.Body)
	if err != nil {
		t.Fatal(err)
	}
	return response, content
}
// DownloadContext is the context for downloading & verifying paths under
// a URL prefix.
type DownloadContext struct {
// test that is failed when a download or an expectation does not hold
t *testing.T
// prefix prepended to every path that is downloaded
urlPrefix string
// HTTP headers sent with every download; tests may add entries to it
Header http.Header
}
// NewDownloadContext creates a DownloadContext for test t that downloads
// paths under urlPrefix, starting with an empty set of request headers.
func NewDownloadContext(t *testing.T, urlPrefix string) *DownloadContext {
	return &DownloadContext{
		t:         t,
		urlPrefix: urlPrefix,
		Header:    http.Header{},
	}
}
// download fetches urlPrefix+path with the context's headers and returns the
// response and its body.
func (dl DownloadContext) download(path string) (*http.Response, []byte) {
	fullURL := dl.urlPrefix + path
	return download(dl.t, fullURL, dl.Header)
}
// ExpectSha1 downloads `path` and fails the test unless the reply has status
// 200 and the sha1 of its content equals `expectSha1`.
func (dl DownloadContext) ExpectSha1(path, expectSha1 string) {
	resp, body := dl.download(path)
	if got := resp.StatusCode; got != 200 {
		dl.t.Fatalf("Unexpected status code (expected 200, got %v)", got)
	}
	if sha1s(body) != expectSha1 {
		dl.t.Fatal("Unexpected content in blob download")
	}
}
// Expect downloads `path` and fails the test unless the content equals
// `expect` (compared via sha1).
func (dl DownloadContext) Expect(path, expect string) {
	want := sha1s([]byte(expect))
	dl.ExpectSha1(path, want)
}
// ExpectCode downloads `path` and fails the test unless the HTTP status code
// of the reply is `code`.
func (dl DownloadContext) ExpectCode(path string, code int) {
	resp, _ := dl.download(path)
	got := resp.StatusCode
	if got == code {
		return
	}
	dl.t.Fatalf("Unexpected status code (expected %v, got %v)", code, got)
}
// TestBlobDownload checks that blobs are served with the right content when
// the auth backend approves every request, and that a missing file gives 404.
func TestBlobDownload(t *testing.T) {
	// Prepare test server and "all-ok" auth backend
	authBackend := testAuthServer(nil, 200, gitOkBody(t))
	defer authBackend.Close()
	workhorse := startWorkhorseServer(authBackend.URL)
	defer workhorse.Close()

	dl := NewDownloadContext(t, fmt.Sprintf("%s/%s/raw", workhorse.URL, testProject))
	dl.Expect("/5f923865/README.md", "testme\n======\n\nSample repo for testing gitlab features\n")

	blobs := []struct{ path, sum string }{
		{"/5f923865/README.md", "5f7af35c185a9e5face2f4afb6d7c4f00328d04c"},
		{"/5f923865/files/ruby/popen.rb", "68990cc20fa74383358797a27967fa2b45d7d8f6"},
		{"/874797c3/files/ruby/popen.rb", "4c266708f2bfd7ca3fed3f7ec74253f92ff3fe73"},
	}
	for _, blob := range blobs {
		dl.ExpectSha1(blob.path, blob.sum)
	}

	dl.ExpectCode("/master/non-existing-file", 404)
}
// TestDeniedBlobDownload checks that every blob request is answered with 403
// when the auth backend denies access - whether or not the blob exists.
func TestDeniedBlobDownload(t *testing.T) {
	// Prepare test server and "all-deny" auth backend
	authBackend := testAuthServer(nil, 403, "Access denied")
	defer authBackend.Close()
	workhorse := startWorkhorseServer(authBackend.URL)
	defer workhorse.Close()

	dl := NewDownloadContext(t, fmt.Sprintf("%s/%s/raw", workhorse.URL, testProject))
	paths := []string{
		"/5f923865/README.md",
		"/5f923865/files/ruby/popen.rb",
		"/874797c3/files/ruby/popen.rb",
		"/master/non-existing-file",
	}
	for _, path := range paths {
		dl.ExpectCode(path, 403)
	}
}
// TestPrivateBlobDownload checks that the request query and headers are
// propagated to the auth backend: download succeeds only when the right
// token is passed either via query or via header.
func TestPrivateBlobDownload(t *testing.T) {
	// Prepare test server and auth backend:
	// access is ok if token is provided either via query or via header
	ts := testServerWithHandler(nil, func(w http.ResponseWriter, r *http.Request) {
		log.Println("UPSTREAM", r.Method, r.URL)
		tokenOK1 := r.URL.Query().Get("aaa_token") == "TOKEN-4AAA"
		tokenOK2 := r.Header.Get("BBB-TOKEN") == "TOKEN-4BBB"
		if !(tokenOK1 || tokenOK2) {
			w.WriteHeader(403)
			fmt.Fprintf(w, "Access denied")
			return
		}

		data, err := json.Marshal(gitOkBody(t))
		if err != nil {
			// This handler runs on the HTTP server's goroutine, where
			// t.Fatal (FailNow) must not be called. Mark the test as
			// failed and let the client see an error reply instead.
			t.Error(err)
			w.WriteHeader(http.StatusInternalServerError)
			return
		}
		w.WriteHeader(200)
		w.Write(data)
	})
	defer ts.Close()
	ws := startWorkhorseServer(ts.URL)
	defer ws.Close()

	dl := NewDownloadContext(t, fmt.Sprintf("%s/%s/raw", ws.URL, testProject))

	// token via query: only aaa_token is accepted
	dl.ExpectCode("/5f923865/README.md", 403)
	dl.ExpectCode("/5f923865/README.md?bbb_token=TOKEN-4BBB", 403)
	dl.ExpectCode("/5f923865/README.md?aaa_token=TOKEN-4AAA", 200)
	dl.ExpectSha1("/5f923865/README.md?aaa_token=TOKEN-4AAA", "5f7af35c185a9e5face2f4afb6d7c4f00328d04c")

	// token via header: only BBB-TOKEN is accepted
	dl.Header.Add("AAA-TOKEN", "TOKEN-4AAA")
	dl.ExpectCode("/5f923865/README.md", 403)
	dl.Header.Add("BBB-TOKEN", "TOKEN-4BBB")
	dl.ExpectCode("/5f923865/README.md", 200)
	dl.ExpectSha1("/5f923865/README.md", "5f7af35c185a9e5face2f4afb6d7c4f00328d04c")
}
......@@ -75,6 +75,7 @@ var gitServices = [...]gitService{
gitService{"GET", regexp.MustCompile(`/repository/archive.tar.gz\z`), repoPreAuthorizeHandler(handleGetArchive)},
gitService{"GET", regexp.MustCompile(`/repository/archive.tar.bz2\z`), repoPreAuthorizeHandler(handleGetArchive)},
gitService{"GET", regexp.MustCompile(`/uploads/`), handleSendFile},
gitService{"GET", regexp.MustCompile(`/raw/`), handleGetBlobRaw},
// Git LFS
gitService{"PUT", regexp.MustCompile(`/gitlab-lfs/objects/([0-9a-f]{64})/([0-9]+)\z`), lfsAuthorizeHandler(handleStoreLfsObject)},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment