As we reevaluate how to best support and maintain Staging Ref in the future, we encourage development teams using this environment to highlight their use cases in the following issue: https://gitlab.com/gitlab-com/gl-infra/software-delivery/framework/software-delivery-framework-issue-tracker/-/issues/36.

Skip to content
Snippets Groups Projects
Commit a093c9d3 authored by Steve Azzopardi's avatar Steve Azzopardi Committed by Ash McKenzie
Browse files

feat: retry on error

What
---
Change the default `HTTP.Client` to
`github.com/hashicorp/go-retryablehttp.Client` to get automatic retries
and exponential backoff.

We retry the request 2 times resulting in 3 attempts of sending the
request, the min retry wait is 1 second, and the maximum is 15
seconds.

Hide the retry logic behind a temporary feature flag
`FF_GITLAB_SHELL_RETRYABLE_HTTP` to easily roll this out in GitLab.com.
When we verify that this works as expected we will remove
`FF_GITLAB_SHELL_RETRYABLE_HTTP` and have the retry logic as the default
logic.

Why
---
In https://gitlab.com/gitlab-com/gl-infra/production/-/issues/7979 users
end up seeing the following errors when trying to `git-clone(1)` a
repository locally on in CI.

```shell
remote: ===============================
remote:
remote: ERROR: Internal API unreachable
remote:
remote: ================================
```

When we look at the application logs we see the following error:

```json
{ "err": "http://gitlab-webservice-git.gitlab.svc:8181/api/v4/internal/allowed":
dial tcp 10.69.184.120:8181: connect: connection refused", "msg":
"Internal API unreachable"}
```

In
https://gitlab.com/gitlab-com/gl-infra/production/-/issues/7979#note_1222670120
we've correlated these `connection refused` errors with infrastructure
events that remove the git pods that are hosting
`gitlab-webservice-git` service. We could try to make the underlying
infrastructure more reactive to these changes as suggested in
https://gitlab.com/gitlab-com/gl-infra/production/-/issues/7979#note_1225164944
but we can still end up serving bad requests.

Implementing retry logic for 5xx or other errors would allow users to
still be able to `git-clone(1)` reposirories, although it being slower.
This is espically important during CI runs so users don't have to retry
jobs themselves.

Reference: https://gitlab.com/gitlab-com/gl-infra/production/-/issues/7979
Closes: https://gitlab.com/gitlab-org/gitlab-shell/-/issues/604


Signed-off-by: default avatarSteve Azzopardi <sazzopardi@gitlab.com>
parent c5b3accf
No related branches found
No related tags found
No related merge requests found
Showing
with 134 additions and 54 deletions
Loading
Loading
@@ -6,6 +6,8 @@ import (
"fmt"
"io"
"net/http"
"net/http/httptest"
"os"
"path"
"strings"
"testing"
Loading
Loading
@@ -18,9 +20,7 @@ import (
"gitlab.com/gitlab-org/gitlab-shell/v14/internal/testhelper"
)
var (
secret = "sssh, it's a secret"
)
var secret = "sssh, it's a secret"
func TestClients(t *testing.T) {
testhelper.PrepareTestRootDir(t)
Loading
Loading
@@ -70,6 +70,11 @@ func TestClients(t *testing.T) {
},
secret: "\n" + secret + "\n",
},
{
desc: "Retry client",
server: testserver.StartRetryHttpServer,
secret: secret,
},
}
for _, tc := range testCases {
Loading
Loading
@@ -297,3 +302,43 @@ func buildRequests(t *testing.T, relativeURLRoot string) []testserver.TestReques
return requests
}
func TestRetryableHTTPFeatureToggle(t *testing.T) {
t.Run("retryable http off", func(t *testing.T) {
os.Setenv("FF_GITLAB_SHELL_RETRYABLE_HTTP", "0")
reqAttempts := 0
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
reqAttempts++
w.WriteHeader(500)
}))
defer srv.Close()
httpClient, err := NewHTTPClientWithOpts(srv.URL, "/", "", "", 1, nil)
require.NoError(t, err)
client, err := NewGitlabNetClient("", "", "", httpClient)
require.NoError(t, err)
_, err = client.Get(context.Background(), "/")
require.EqualError(t, err, "Internal API unreachable")
require.Equal(t, 1, reqAttempts)
})
t.Run("retryable http on", func(t *testing.T) {
os.Setenv("FF_GITLAB_SHELL_RETRYABLE_HTTP", "1")
reqAttempts := 0
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
reqAttempts++
w.WriteHeader(500)
}))
defer srv.Close()
httpClient, err := NewHTTPClientWithOpts(srv.URL, "/", "", "", 1, nil)
require.NoError(t, err)
client, err := NewGitlabNetClient("", "", "", httpClient)
require.NoError(t, err)
_, err = client.Get(context.Background(), "/")
require.EqualError(t, err, "Internal API unreachable")
require.Equal(t, 3, reqAttempts)
})
}
Loading
Loading
@@ -11,6 +11,7 @@ import (
"time"
"github.com/golang-jwt/jwt/v4"
"github.com/hashicorp/go-retryablehttp"
"gitlab.com/gitlab-org/labkit/log"
)
Loading
Loading
@@ -88,7 +89,7 @@ func appendPath(host string, path string) string {
return strings.TrimSuffix(host, "/") + "/" + strings.TrimPrefix(path, "/")
}
func newRequest(ctx context.Context, method, host, path string, data interface{}) (*http.Request, error) {
func newRequest(ctx context.Context, method, host, path string, data interface{}) (*retryablehttp.Request, error) {
var jsonReader io.Reader
if data != nil {
jsonData, err := json.Marshal(data)
Loading
Loading
@@ -99,7 +100,7 @@ func newRequest(ctx context.Context, method, host, path string, data interface{}
jsonReader = bytes.NewReader(jsonData)
}
request, err := http.NewRequestWithContext(ctx, method, appendPath(host, path), jsonReader)
request, err := retryablehttp.NewRequestWithContext(ctx, method, appendPath(host, path), jsonReader)
if err != nil {
return nil, err
}
Loading
Loading
Loading
Loading
@@ -13,6 +13,7 @@ import (
"strings"
"time"
"github.com/hashicorp/go-retryablehttp"
"gitlab.com/gitlab-org/labkit/correlation"
"gitlab.com/gitlab-org/labkit/tracing"
)
Loading
Loading
@@ -25,12 +26,10 @@ const (
defaultReadTimeoutSeconds = 300
)
var (
ErrCafileNotFound = errors.New("cafile not found")
)
var ErrCafileNotFound = errors.New("cafile not found")
type HttpClient struct {
*http.Client
*retryablehttp.Client
Host string
}
Loading
Loading
@@ -101,10 +100,15 @@ func NewHTTPClientWithOpts(gitlabURL, gitlabRelativeURLRoot, caFile, caPath stri
return nil, errors.New("unknown GitLab URL prefix")
}
c := &http.Client{
Transport: correlation.NewInstrumentedRoundTripper(tracing.NewRoundTripper(transport)),
Timeout: readTimeout(readTimeoutSeconds),
c := retryablehttp.NewClient()
c.RetryMax = 0 // Retry logic is disabled by default
if os.Getenv("FF_GITLAB_SHELL_RETRYABLE_HTTP") == "1" { // Feature toggle for enabling retries on GitLab client.
c.RetryMax = 2
c.RetryWaitMax = 15 * time.Second
}
c.Logger = nil
c.HTTPClient.Transport = correlation.NewInstrumentedRoundTripper(tracing.NewRoundTripper(transport))
c.HTTPClient.Timeout = readTimeout(readTimeoutSeconds)
client := &HttpClient{Client: c, Host: host}
Loading
Loading
@@ -132,7 +136,6 @@ func buildSocketTransport(gitlabURL, gitlabRelativeURLRoot string) (*http.Transp
func buildHttpsTransport(hcc httpClientCfg, gitlabURL string) (*http.Transport, string, error) {
certPool, err := x509.SystemCertPool()
if err != nil {
certPool = x509.NewCertPool()
}
Loading
Loading
Loading
Loading
@@ -21,7 +21,7 @@ func TestReadTimeout(t *testing.T) {
require.NoError(t, err)
require.NotNil(t, client)
require.Equal(t, time.Duration(expectedSeconds)*time.Second, client.Client.Timeout)
require.Equal(t, time.Duration(expectedSeconds)*time.Second, client.HTTPClient.Timeout)
}
const (
Loading
Loading
Loading
Loading
@@ -59,6 +59,35 @@ func StartHttpServer(t *testing.T, handlers []TestRequestHandler) string {
return server.URL
}
func StartRetryHttpServer(t *testing.T, handlers []TestRequestHandler) string {
os.Setenv("FF_GITLAB_SHELL_RETRYABLE_HTTP", "1")
attempts := map[string]int{}
retryMiddileware := func(next func(w http.ResponseWriter, r *http.Request)) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
attempts[r.URL.String()+r.Method]++
if attempts[r.URL.String()+r.Method] == 1 {
w.WriteHeader(500)
return
}
http.HandlerFunc(next).ServeHTTP(w, r)
})
}
t.Helper()
h := http.NewServeMux()
for _, handler := range handlers {
h.Handle(handler.Path, retryMiddileware(handler.Handler))
}
server := httptest.NewServer(h)
t.Cleanup(func() { server.Close() })
return server.URL
}
func StartHttpsServer(t *testing.T, handlers []TestRequestHandler, clientCAPath string) string {
t.Helper()
Loading
Loading
Loading
Loading
@@ -6,6 +6,7 @@ require (
github.com/golang-jwt/jwt/v4 v4.4.1
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
github.com/hashicorp/go-retryablehttp v0.7.1
github.com/mattn/go-shellwords v1.0.11
github.com/mikesmitty/edkey v0.0.0-20170222072505-3356ea4e686a
github.com/otiai10/copy v1.4.2
Loading
Loading
@@ -48,6 +49,7 @@ require (
github.com/google/pprof v0.0.0-20210804190019-f964ff605595 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/googleapis/gax-go/v2 v2.2.0 // indirect
github.com/hashicorp/go-cleanhttp v0.5.1 // indirect
github.com/hashicorp/yamux v0.1.1 // indirect
github.com/jmespath/go-jmespath v0.4.0 // indirect
github.com/kr/text v0.2.0 // indirect
Loading
Loading
Loading
Loading
@@ -16,33 +16,31 @@ import (
"gitlab.com/gitlab-org/gitlab-shell/v14/internal/config"
)
var (
requests = []testserver.TestRequestHandler{
{
Path: "/api/v4/internal/discover",
Handler: func(w http.ResponseWriter, r *http.Request) {
if r.URL.Query().Get("key_id") == "1" || r.URL.Query().Get("username") == "alex-doe" {
body := map[string]interface{}{
"id": 2,
"username": "alex-doe",
"name": "Alex Doe",
}
json.NewEncoder(w).Encode(body)
} else if r.URL.Query().Get("username") == "broken_message" {
body := map[string]string{
"message": "Forbidden!",
}
w.WriteHeader(http.StatusForbidden)
json.NewEncoder(w).Encode(body)
} else if r.URL.Query().Get("username") == "broken" {
w.WriteHeader(http.StatusInternalServerError)
} else {
fmt.Fprint(w, "null")
var requests = []testserver.TestRequestHandler{
{
Path: "/api/v4/internal/discover",
Handler: func(w http.ResponseWriter, r *http.Request) {
if r.URL.Query().Get("key_id") == "1" || r.URL.Query().Get("username") == "alex-doe" {
body := map[string]interface{}{
"id": 2,
"username": "alex-doe",
"name": "Alex Doe",
}
json.NewEncoder(w).Encode(body)
} else if r.URL.Query().Get("username") == "broken_message" {
body := map[string]string{
"message": "Forbidden!",
}
},
w.WriteHeader(http.StatusForbidden)
json.NewEncoder(w).Encode(body)
} else if r.URL.Query().Get("username") == "broken" {
w.WriteHeader(http.StatusInternalServerError)
} else {
fmt.Fprint(w, "null")
}
},
}
)
},
}
func TestExecute(t *testing.T) {
url := testserver.StartSocketHttpServer(t, requests)
Loading
Loading
@@ -112,7 +110,7 @@ func TestFailingExecute(t *testing.T) {
{
desc: "When the API fails",
arguments: &commandargs.Shell{GitlabUsername: "broken"},
expectedError: "Failed to get username: Internal API error (500)",
expectedError: "Failed to get username: Internal API unreachable",
},
}
Loading
Loading
Loading
Loading
@@ -84,5 +84,5 @@ func TestFailingAPIExecute(t *testing.T) {
err := cmd.Execute(context.Background())
require.Empty(t, buffer.String())
require.EqualError(t, err, "Internal API available: FAILED - Internal API error (500)")
require.EqualError(t, err, "Internal API available: FAILED - Internal API unreachable")
}
Loading
Loading
@@ -17,9 +17,7 @@ import (
"gitlab.com/gitlab-org/gitlab-shell/v14/internal/gitlabnet/personalaccesstoken"
)
var (
requests []testserver.TestRequestHandler
)
var requests []testserver.TestRequestHandler
func setup(t *testing.T) {
requests = []testserver.TestRequestHandler{
Loading
Loading
@@ -147,7 +145,7 @@ func TestExecute(t *testing.T) {
GitlabKeyId: "broken",
SshArgs: []string{cmdname, "newtoken", "read_api,read_repository"},
},
expectedError: "Internal API error (500)",
expectedError: "Internal API unreachable",
},
{
desc: "Without KeyID or User",
Loading
Loading
Loading
Loading
@@ -18,9 +18,7 @@ import (
"gitlab.com/gitlab-org/gitlab-shell/v14/internal/gitlabnet/twofactorrecover"
)
var (
requests []testserver.TestRequestHandler
)
var requests []testserver.TestRequestHandler
func setup(t *testing.T) {
requests = []testserver.TestRequestHandler{
Loading
Loading
@@ -99,7 +97,7 @@ func TestExecute(t *testing.T) {
desc: "With API fails",
arguments: &commandargs.Shell{GitlabKeyId: "broken"},
answer: "yes\n",
expectedOutput: question + errorHeader + "Internal API error (500)\n",
expectedOutput: question + errorHeader + "Internal API unreachable\n",
},
{
desc: "With missing arguments",
Loading
Loading
Loading
Loading
@@ -136,7 +136,7 @@ func TestExecute(t *testing.T) {
{
desc: "With API fails",
arguments: &commandargs.Shell{GitlabKeyId: "broken"},
expectedOutput: errorHeader + "Internal API error (500)\n",
expectedOutput: errorHeader + "Internal API unreachable\n",
},
{
desc: "With missing arguments",
Loading
Loading
Loading
Loading
@@ -134,8 +134,8 @@ func (c *Config) HttpClient() (*client.HttpClient, error) {
return
}
tr := client.Transport
client.Transport = metrics.NewRoundTripper(tr)
tr := client.HTTPClient.Transport
client.HTTPClient.Transport = metrics.NewRoundTripper(tr)
c.httpClient = client
})
Loading
Loading
Loading
Loading
@@ -74,7 +74,7 @@ func TestFailedRequests(t *testing.T) {
{
desc: "With API fails",
args: &commandargs.Shell{GitlabKeyId: "broken", CommandType: commandargs.LfsAuthenticate, SshArgs: []string{"git-lfs-authenticate", repo, "download"}},
expectedOutput: "Internal API error (500)",
expectedOutput: "Internal API unreachable",
},
}
Loading
Loading
Loading
Loading
@@ -124,7 +124,7 @@ describe 'bin/gitlab-shell' do
it 'returns an error message when the API call fails without a message' do
_, stderr, status = run!(["-c/usr/share/webapps/gitlab-shell/bin/gitlab-shell", "username-broken"])
expect(stderr).to match(/Failed to get username: Internal API error \(500\)/)
expect(stderr).to match(/Failed to get username: Internal API unreachable/)
expect(status).not_to be_success
end
end
Loading
Loading
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment