Skip to content

Commit cfd505c

Browse files
Fixed and improved GitLab project metadata in-memory cache
Previously, we used a plain map to temporarily store GitLab project metadata. A map works well for small datasets, but it does not scale efficiently to larger ones. There was also a bug in the caching logic: entries were stored under the GitLab HTTPURLToRepo field as the cache key, but were looked up by the normalized URL. As a result, cache lookups almost never succeeded, and the cache kept growing without being used effectively. This fix replaces the map with an LRU cache, which is better suited to this use case. The cache now holds up to 15,000 entries, each kept for one hour, after which the LRU mechanism automatically evicts old items, keeping memory usage under control. We also now consistently use the normalized URL as the key for both setting and fetching cache entries.
1 parent f70218b commit cfd505c

File tree

4 files changed

+179
-139
lines changed

4 files changed

+179
-139
lines changed

pkg/sources/gitlab/gitlab.go

Lines changed: 140 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,14 @@ import (
1010
"sync"
1111
"time"
1212

13+
gogit "github.com/go-git/go-git/v5"
14+
"github.com/gobwas/glob"
15+
gitlab "gitlab.com/gitlab-org/api/client-go"
16+
"golang.org/x/oauth2"
17+
"golang.org/x/sync/errgroup"
18+
"google.golang.org/protobuf/proto"
19+
"google.golang.org/protobuf/types/known/anypb"
20+
1321
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
1422
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
1523
"github.com/trufflesecurity/trufflehog/v3/pkg/feature"
@@ -20,20 +28,12 @@ import (
2028
"github.com/trufflesecurity/trufflehog/v3/pkg/sanitizer"
2129
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
2230
"github.com/trufflesecurity/trufflehog/v3/pkg/sources/git"
23-
24-
gogit "github.com/go-git/go-git/v5"
25-
"github.com/gobwas/glob"
26-
gitlab "gitlab.com/gitlab-org/api/client-go"
27-
"golang.org/x/oauth2"
28-
"golang.org/x/sync/errgroup"
29-
"google.golang.org/protobuf/proto"
30-
"google.golang.org/protobuf/types/known/anypb"
3131
)
3232

3333
const SourceType = sourcespb.SourceType_SOURCE_TYPE_GITLAB
3434

35-
// This is the URL for gitlab hosted at gitlab.com
36-
const gitlabBaseURL = "https://gitlab.com/"
35+
// Base URL for GitLab Cloud (hosted at gitlab.com)
36+
const gitlabCloudBaseUrl = "https://gitlab.com/"
3737

3838
type Source struct {
3939
name string
@@ -72,16 +72,15 @@ type Source struct {
7272
sources.CommonSourceUnitUnmarshaller
7373

7474
useAuthInUrl bool
75-
76-
clonePath string
77-
noCleanup bool
75+
clonePath string
76+
noCleanup bool
7877

7978
printLegacyJSON bool
8079

8180
projectsPerPage int64
8281

8382
// cache of repo URL to project info, used when generating metadata for chunks
84-
repoToProjCache repoToProjectCache
83+
*projectMetadataCache
8584
}
8685

8786
// WithCustomContentWriter sets the useCustomContentWriter flag on the source.
@@ -242,11 +241,12 @@ func (s *Source) Init(ctx context.Context, name string, jobId sources.JobID, sou
242241
Timestamp: sanitizer.UTF8(timestamp),
243242
Line: line,
244243
}
245-
proj, ok := s.repoToProjCache.get(repository)
244+
// check for project metadata in the cache
245+
project, ok := s.projectMetadataCache.get(repository)
246246
if ok {
247-
gitlabMetadata.ProjectId = int64(proj.id)
248-
gitlabMetadata.ProjectName = proj.name
249-
gitlabMetadata.ProjectOwner = proj.owner
247+
gitlabMetadata.ProjectId = project.id
248+
gitlabMetadata.ProjectName = project.name
249+
gitlabMetadata.ProjectOwner = project.owner
250250
}
251251

252252
return &source_metadatapb.MetaData{
@@ -260,9 +260,7 @@ func (s *Source) Init(ctx context.Context, name string, jobId sources.JobID, sou
260260
}
261261
s.git = git.NewGit(cfg)
262262

263-
s.repoToProjCache = repoToProjectCache{
264-
cache: make(map[string]*project),
265-
}
263+
s.projectMetadataCache = NewProjectMetadataCache()
266264

267265
return nil
268266
}
@@ -314,8 +312,10 @@ func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, tar
314312

315313
} else {
316314
gitlabReposEnumerated.WithLabelValues(s.name).Set(float64(len(repos)))
317-
// ensure project details for specified repos are cached
318-
// this is required to populate metadata during chunking
315+
// Ensure project details for the specified repositories are cached.
316+
// This is required to populate metadata during chunking.
317+
// Note: Repository URLs are already normalized, so the cache check
318+
// uses the normalized repo URL directly.
319319
for _, repo := range repos {
320320
s.ensureProjectInCache(ctx, repo)
321321
}
@@ -586,7 +586,10 @@ func (s *Source) getAllProjectRepos(
586586
}
587587
// Report the unit.
588588
ctx.Logger().V(3).Info("accepting project")
589-
s.cacheGitlabProject(proj)
589+
590+
// Cache the GitLab project metadata.
591+
s.cacheGitlabProjectMetadata(ctx, proj)
592+
590593
unit := git.SourceUnit{Kind: git.UnitRepo, ID: proj.HTTPURLToRepo}
591594
gitlabReposEnumerated.WithLabelValues(s.name).Inc()
592595
projectsWithNamespace = append(projectsWithNamespace, proj.NameWithNamespace)
@@ -630,7 +633,7 @@ func (s *Source) getAllProjectRepos(
630633
Owned: gitlab.Ptr(false),
631634
}
632635

633-
if s.url != gitlabBaseURL {
636+
if s.url != gitlabCloudBaseUrl {
634637
listGroupsOptions.AllAvailable = gitlab.Ptr(true)
635638
}
636639

@@ -722,7 +725,7 @@ func (s *Source) getAllProjectReposV2(
722725
}
723726

724727
// for gitlab.com instance, include only projects where the user is a member.
725-
if s.url == gitlabBaseURL {
728+
if s.url == gitlabCloudBaseUrl {
726729
projectQueryOptions.Membership = gitlab.Ptr(true)
727730
}
728731

@@ -783,7 +786,9 @@ func (s *Source) getAllProjectReposV2(
783786
// report the unit.
784787
projCtx.Logger().V(3).Info("accepting project")
785788

786-
s.cacheGitlabProject(project)
789+
// Cache the GitLab project metadata.
790+
s.cacheGitlabProjectMetadata(projCtx, project)
791+
787792
unit := git.SourceUnit{Kind: git.UnitRepo, ID: project.HTTPURLToRepo}
788793
gitlabReposEnumerated.WithLabelValues(s.name).Inc()
789794

@@ -837,7 +842,7 @@ func (s *Source) getAllProjectReposInGroups(
837842
}
838843

839844
// For non gitlab.com instances, you might want to adjust access levels
840-
if s.url != gitlabBaseURL {
845+
if s.url != gitlabCloudBaseUrl {
841846
projectOpts.MinAccessLevel = gitlab.Ptr(gitlab.GuestPermissions)
842847
}
843848

@@ -899,7 +904,9 @@ func (s *Source) getAllProjectReposInGroups(
899904
// report the unit.
900905
projCtx.Logger().V(3).Info("accepting project")
901906

902-
s.cacheGitlabProject(proj)
907+
// Cache the GitLab project metadata.
908+
s.cacheGitlabProjectMetadata(projCtx, proj)
909+
903910
unit := git.SourceUnit{Kind: git.UnitRepo, ID: proj.HTTPURLToRepo}
904911
gitlabReposEnumerated.WithLabelValues(s.name).Inc()
905912
projectsWithNamespace = append(projectsWithNamespace, proj.NameWithNamespace)
@@ -1022,77 +1029,6 @@ func (s *Source) WithScanOptions(scanOptions *git.ScanOptions) {
10221029
s.scanOptions = scanOptions
10231030
}
10241031

1025-
func buildIgnorer(include, exclude []string, onCompile func(err error, pattern string)) func(repo string) bool {
1026-
1027-
// compile and load globRepoFilter
1028-
globRepoFilter := newGlobRepoFilter(include, exclude, onCompile)
1029-
1030-
f := func(repo string) bool {
1031-
if !globRepoFilter.includeRepo(repo) || globRepoFilter.ignoreRepo(repo) {
1032-
return true
1033-
}
1034-
return false
1035-
}
1036-
1037-
return f
1038-
}
1039-
1040-
func normalizeRepos(repos []string) ([]string, []error) {
1041-
// Optimistically allocate space for all valid repositories.
1042-
validRepos := make([]string, 0, len(repos))
1043-
var errs []error
1044-
for _, prj := range repos {
1045-
repo, err := giturl.NormalizeGitlabRepo(prj)
1046-
if err != nil {
1047-
errs = append(errs, fmt.Errorf("unable to normalize gitlab repo url %q: %w", prj, err))
1048-
continue
1049-
}
1050-
1051-
validRepos = append(validRepos, repo)
1052-
}
1053-
return validRepos, errs
1054-
}
1055-
1056-
// normalizeGitlabEndpoint ensures that if an endpoint is going to gitlab.com, we use https://gitlab.com/ as the endpoint.
1057-
// If we see the protocol is http, we error, because this shouldn't be used.
1058-
// Otherwise, it ensures we are using https as our protocol, if none was provided.
1059-
func normalizeGitlabEndpoint(gitlabEndpoint string) (string, error) {
1060-
if gitlabEndpoint == "" {
1061-
return gitlabBaseURL, nil
1062-
}
1063-
1064-
gitlabURL, err := url.Parse(gitlabEndpoint)
1065-
if err != nil {
1066-
return "", err
1067-
}
1068-
1069-
// We probably didn't receive a URL with a scheme, which messed up the parsing.
1070-
if gitlabURL.Host == "" {
1071-
gitlabURL, err = url.Parse("https://" + gitlabEndpoint)
1072-
if err != nil {
1073-
return "", err
1074-
}
1075-
}
1076-
1077-
// If the host is gitlab.com, this is the cloud version, which has only one valid endpoint.
1078-
if gitlabURL.Host == "gitlab.com" {
1079-
return gitlabBaseURL, nil
1080-
}
1081-
1082-
// Beyond here, on-prem gitlab is being used, so we have to mostly leave things as-is.
1083-
1084-
if gitlabURL.Scheme != "https" {
1085-
return "", fmt.Errorf("https was not used as URL scheme, but is required. Please use https")
1086-
}
1087-
1088-
// The gitlab library wants trailing slashes.
1089-
if !strings.HasSuffix(gitlabURL.Path, "/") {
1090-
gitlabURL.Path = gitlabURL.Path + "/"
1091-
}
1092-
1093-
return gitlabURL.String(), nil
1094-
}
1095-
10961032
// Enumerate reports all GitLab repositories to be scanned to the reporter. If
10971033
// none are configured, it will find all repositories within all projects that
10981034
// the configured user has access to, while respecting the configured ignore
@@ -1191,23 +1127,25 @@ func (s *Source) ChunkUnit(ctx context.Context, unit sources.SourceUnit, reporte
11911127
return s.git.ScanRepo(ctx, repo, path, s.scanOptions, reporter)
11921128
}
11931129

1194-
// ensureProjectInCache checks if the project for the given repo URL is in the cache,
1195-
// and if not, queries the GitLab API to fetch the project and adds it to the cache.
1196-
func (s *Source) ensureProjectInCache(ctx context.Context, repoUrl string) {
1197-
// check if project is already in cache
1198-
if _, ok := s.repoToProjCache.get(repoUrl); ok {
1130+
// ensureProjectInCache ensures that the project for the given repository URL
1131+
// exists in the cache. If not, it fetches the project from the GitLab API
1132+
// and stores it in the cache.
1133+
func (s *Source) ensureProjectInCache(ctx context.Context, repoURL string) {
1134+
// Check if the project is already cached.
1135+
if _, ok := s.projectMetadataCache.get(repoURL); ok {
1136+
ctx.Logger().V(5).Info("cache hit: found project metadata in the cache", "cache_key", repoURL)
11991137
return
12001138
}
12011139

1202-
// query project
1203-
proj, err := s.getGitlabProject(ctx, repoUrl)
1140+
// Fetch the project from GitLab.
1141+
project, err := s.getGitlabProject(ctx, repoURL)
12041142
if err != nil {
1205-
ctx.Logger().Error(err, "could not fetch project for repo", "repo", repoUrl)
1143+
ctx.Logger().Error(err, "failed to fetch GitLab project", "repo", repoURL)
12061144
return
12071145
}
12081146

1209-
// add to cache
1210-
s.cacheGitlabProject(proj)
1147+
// Cache the project metadata.
1148+
s.cacheGitlabProjectMetadata(ctx, project)
12111149
}
12121150

12131151
func (s *Source) getGitlabProject(ctx context.Context, repoUrl string) (*gitlab.Project, error) {
@@ -1230,16 +1168,100 @@ func (s *Source) getGitlabProject(ctx context.Context, repoUrl string) (*gitlab.
12301168
return proj, nil
12311169
}
12321170

1233-
func (s *Source) cacheGitlabProject(gitlabProj *gitlab.Project) {
1171+
// cacheGitlabProjectMetadata caches GitLab project metadata keyed by the
1172+
// normalized GitLab repository URL.
1173+
func (s *Source) cacheGitlabProjectMetadata(ctx context.Context, glProject *gitlab.Project) {
12341174
proj := &project{
1235-
id: gitlabProj.ID,
1236-
name: gitlabProj.NameWithNamespace,
1175+
id: int64(glProject.ID),
1176+
name: glProject.NameWithNamespace,
1177+
}
1178+
1179+
if glProject.Owner != nil {
1180+
if email := glProject.Owner.Email; email != "" {
1181+
proj.owner = email
1182+
} else {
1183+
proj.owner = glProject.Owner.Username
1184+
}
12371185
}
1238-
if gitlabProj.Owner != nil {
1239-
proj.owner = gitlabProj.Owner.Email
1240-
if proj.owner == "" {
1241-
proj.owner = gitlabProj.Owner.Username
1186+
1187+
repoURL, err := giturl.NormalizeGitlabRepo(glProject.HTTPURLToRepo)
1188+
if err != nil {
1189+
ctx.Logger().Error(err, "failed to normalize GitLab Repo", "repo", glProject.HTTPURLToRepo)
1190+
return
1191+
}
1192+
1193+
ctx.Logger().V(5).Info("cache set: added project metadata in the cache", "cache_key", repoURL)
1194+
s.projectMetadataCache.set(repoURL, proj)
1195+
}
1196+
1197+
func buildIgnorer(include, exclude []string, onCompile func(err error, pattern string)) func(repo string) bool {
1198+
1199+
// compile and load globRepoFilter
1200+
globRepoFilter := newGlobRepoFilter(include, exclude, onCompile)
1201+
1202+
f := func(repo string) bool {
1203+
if !globRepoFilter.includeRepo(repo) || globRepoFilter.ignoreRepo(repo) {
1204+
return true
12421205
}
1206+
return false
12431207
}
1244-
s.repoToProjCache.set(gitlabProj.HTTPURLToRepo, proj)
1208+
1209+
return f
1210+
}
1211+
1212+
// normalizeRepos normalizes each repo URL (e.g. https://gitlab.com/org/repo -> https://gitlab.com/org/repo.git) and collects an error for every URL that cannot be normalized.
1213+
func normalizeRepos(repos []string) ([]string, []error) {
1214+
// Optimistically allocate space for all valid repositories.
1215+
validRepos := make([]string, 0, len(repos))
1216+
var errs []error
1217+
for _, prj := range repos {
1218+
repo, err := giturl.NormalizeGitlabRepo(prj)
1219+
if err != nil {
1220+
errs = append(errs, fmt.Errorf("unable to normalize gitlab repo url %q: %w", prj, err))
1221+
continue
1222+
}
1223+
1224+
validRepos = append(validRepos, repo)
1225+
}
1226+
return validRepos, errs
1227+
}
1228+
1229+
// normalizeGitlabEndpoint ensures that if an endpoint is going to gitlab.com, we use https://gitlab.com/ as the endpoint.
1230+
// For any other host (on-prem GitLab), we return an error if the scheme is not https, because only https should be used.
1231+
// Otherwise, it ensures we are using https as our protocol, if none was provided.
1232+
func normalizeGitlabEndpoint(gitlabEndpoint string) (string, error) {
1233+
if gitlabEndpoint == "" {
1234+
return gitlabCloudBaseUrl, nil
1235+
}
1236+
1237+
gitlabURL, err := url.Parse(gitlabEndpoint)
1238+
if err != nil {
1239+
return "", err
1240+
}
1241+
1242+
// We probably didn't receive a URL with a scheme, which messed up the parsing.
1243+
if gitlabURL.Host == "" {
1244+
gitlabURL, err = url.Parse("https://" + gitlabEndpoint)
1245+
if err != nil {
1246+
return "", err
1247+
}
1248+
}
1249+
1250+
// If the host is gitlab.com, this is the cloud version, which has only one valid endpoint.
1251+
if gitlabURL.Host == "gitlab.com" {
1252+
return gitlabCloudBaseUrl, nil
1253+
}
1254+
1255+
// Beyond here, on-prem gitlab is being used, so we have to mostly leave things as-is.
1256+
1257+
if gitlabURL.Scheme != "https" {
1258+
return "", fmt.Errorf("https was not used as URL scheme, but is required. Please use https")
1259+
}
1260+
1261+
// The gitlab library wants trailing slashes.
1262+
if !strings.HasSuffix(gitlabURL.Path, "/") {
1263+
gitlabURL.Path = gitlabURL.Path + "/"
1264+
}
1265+
1266+
return gitlabURL.String(), nil
12451267
}

0 commit comments

Comments
 (0)