Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
98519fa
enabled symlinks with maximum depth support
MuneebUllahKhan222 Feb 12, 2026
6b7b65f
resolved linter issues
MuneebUllahKhan222 Feb 12, 2026
80a5b68
resolved bugbot comms
MuneebUllahKhan222 Feb 12, 2026
705e559
resolved bugbot comment
MuneebUllahKhan222 Feb 12, 2026
04390ba
resolved concurrency bugs and added maxDepthOption to cli
MuneebUllahKhan222 Feb 13, 2026
29cd13d
Removed visited path map and tracked symlink depth by maintaining a c…
MuneebUllahKhan222 Feb 13, 2026
9e22011
reolved bugbot comm
MuneebUllahKhan222 Feb 13, 2026
f9ebb61
fixed resumption issue by introducing new resumption method
MuneebUllahKhan222 Feb 13, 2026
1317eaa
used the new method
MuneebUllahKhan222 Feb 13, 2026
d739515
separated symlink scanning from scanDir
MuneebUllahKhan222 Feb 19, 2026
efd4daf
resolved bugbot comments
MuneebUllahKhan222 Feb 19, 2026
f22e88b
fixed resumption
MuneebUllahKhan222 Feb 19, 2026
e356f28
Merge branch 'main' into enable-symlinks
MuneebUllahKhan222 Feb 19, 2026
1a5b175
fixed protos issue
MuneebUllahKhan222 Feb 19, 2026
ed9b99d
fixed depth logic
MuneebUllahKhan222 Feb 19, 2026
bc95e59
resolved bugbot issue
MuneebUllahKhan222 Feb 19, 2026
eaca505
resolved depth comments
MuneebUllahKhan222 Feb 19, 2026
8e5df38
changed resumtion logic
MuneebUllahKhan222 Feb 19, 2026
dfba81e
Merge branch 'main' into enable-symlinks
MuneebUllahKhan222 Feb 19, 2026
3a37258
updated func calls in test according to new func signatures
MuneebUllahKhan222 Feb 19, 2026
e0030c2
added regular check handling
MuneebUllahKhan222 Feb 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/engine/filesystem.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ func (e *Engine) ScanFileSystem(ctx context.Context, c sources.FilesystemConfig)
Paths: c.Paths,
IncludePathsFile: c.IncludePathsFile,
ExcludePathsFile: c.ExcludePathsFile,
MaxSymlinkDepth: c.MaxSymlinkDepth,
}
var conn anypb.Any
err := anypb.MarshalFrom(&conn, connection, proto.MarshalOptions{})
Expand Down
1,355 changes: 683 additions & 672 deletions pkg/pb/sourcespb/sources.pb.go

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pkg/pb/sourcespb/sources.pb.validate.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

156 changes: 147 additions & 9 deletions pkg/sources/filesystem/filesystem.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"io/fs"
"os"
"path/filepath"
"sync"

"github.com/go-errors/errors"
"github.com/go-logr/logr"
Expand Down Expand Up @@ -37,13 +38,20 @@ type Source struct {
skipBinaries bool
sources.Progress
sources.CommonSourceUnitUnmarshaller
followSymlinks bool
maxSymlinkDepth int
visitedSymlinkPaths map[string]struct{}
visitedmu sync.Mutex
}

// Ensure the Source satisfies the interfaces at compile time
var _ sources.Source = (*Source)(nil)
var _ sources.SourceUnitUnmarshaller = (*Source)(nil)
var _ sources.SourceUnitEnumChunker = (*Source)(nil)

// max symlink depth allowed
const DEFAULT_MAX_SYMLINK_DEPTH_ALLOWED = 40
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue: Annoyingly, Go strongly discourages constants like this -- it should be DefaultMaxSymlinkDepthAllowed


// Type returns the type of source.
// It is used for matching source types in configuration and job input.
func (s *Source) Type() sourcespb.SourceType {
Expand Down Expand Up @@ -80,10 +88,48 @@ func (s *Source) Init(aCtx context.Context, name string, jobId sources.JobID, so
return fmt.Errorf("unable to create filter: %w", err)
}
s.filter = filter
s.maxSymlinkDepth = int(conn.GetMaxSymlinkDepth())
if s.maxSymlinkDepth > DEFAULT_MAX_SYMLINK_DEPTH_ALLOWED {
err := fmt.Errorf("Specified symlink depth %v exceeds the allowed max allowed symlink depth of %v", s.maxSymlinkDepth, DEFAULT_MAX_SYMLINK_DEPTH_ALLOWED)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue: Line too loooooooong! gofmt won't break long lines and is a curse to us all, but it will let you break them up yourself like:

		err := fmt.Errorf(
      "Specified symlink depth %v exceeds the allowed max allowed symlink depth of %v",
      s.maxSymlinkDepth,
      DEFAULT_MAX_SYMLINK_DEPTH_ALLOWED,
    )

I'd also consider using %d for the special interpolation characters--best to be as specific as we can.

return err
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: Move this check above setting s.maxSymlinkDepth. It doesn't matter at all in this case, but it's a good habit to check a value's range before using it--otherwise if someone comes by later and makes this check non-fatal, you won't have an out-of-range value in the Source.

if s.maxSymlinkDepth > 0 {
s.followSymlinks = true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: Oh I see. I like wrapping these kinds of things in methods like:

func (s *Source) followSymlinks() bool {
  return s.maxSymlinkDepth > 0
}

It codifies the definition in a nice way and doesn't create weirdo struct states--like there's a brief period in time where s.maxSymlinkDepth can be > 0, but s.followSymlinks is false. Again in this case it isn't a big deal, but anyone coming by later and refactoring will have to keep this order in mind. Pulling it out into a method removes the need to order an assignment entirely.

}
Comment on lines +94 to +101
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Validation ⁉️ Incredible haha 🏆

s.visitedSymlinkPaths = make(map[string]struct{})

return nil
}

func (s *Source) resolveSymLink(ctx context.Context, symlinkPath string) (os.FileInfo, string, error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: Reading this, it made me think this should be scanSymlink in the same vein as like scanDir and scanFile.

func (s *Source) resolveSymLink(
  ctx context.Context,
  path string,
  chunksChan chan *sources.Chunk,
  workerPool *errgroup.Group,
  depth int,
  rootPath string,
) error {
  depth++
  if depth >= s.maxSymlinkDepth {
    return errors.New("max symlink depth reached")
  }

  path = filepath.Clean(path)

  res, err := os.ReadLink(path)
  if err != nil {
    return fmt.Errorf("readlink error: %w", err)
  }

  if !filepath.IsAbs(res) {
    res = filepath.Join(filepath.Dir(path), res)
  }

  fileInfo, err = os.Lstat(resolvedFilePath)
  if err != nil {
    return fmt.Errorf("lstat error: %w", err)
  }

  if fileInfo.Mode()&os.ModeSymlink == 0 {
    ctx.Logger().V(5).Info("found symlink to symlink",
      "symlinkPath", path,
      "resolvedPath", res,
      "depth", depth+1,
    )

    return scanSymlink(ctx, res, chunksChan, workerPool, depth, rootPath)
  }

  if fileInfo.IsDir() {
    ctx.Logger().V(5).Info("found symlink to dir",
      "symlinkPath", path,
      "resolvedPath", res,
      "depth", depth+1,
    )

    return s.scanDir(ctx, res, chunksChan, workerPool, depth, rootPath)
  }

  ctx.Logger().V(5).Info("found symlink to file",
    "symlinkPath", path,
    "resolvedPath", res,
    "depth", depth+1,
  )

  return s.scanFile(ctx, res, chunksChan)
}

It does mean that scanDir has to pass depth into this function, but I think it's a little more consistent.

But the main virtue is that we prevent users from slipping past the symlink depth limit. Via scanDir, we could be 4-5 levels deep before we find a symlink, but this method doesn't know that, so we might be accidentally going too deep.

var fileInfo os.FileInfo
var err error
resolvedPath := symlinkPath
var resolvedFilePath string
for depth := 0; depth < s.maxSymlinkDepth; depth++ {
resolvedFilePath, err = os.Readlink(resolvedPath)
if err != nil {
return nil, "", fmt.Errorf("Error in resolving symlink: %v", err)
}
if !filepath.IsAbs(resolvedFilePath) {
resolvedFilePath = filepath.Join(
filepath.Dir(resolvedPath),
resolvedFilePath,
)
}
fileInfo, err = os.Lstat(resolvedFilePath)
if err != nil {
return nil, "", fmt.Errorf("Error in retrieving info: %v", err)
}
if fileInfo.Mode()&os.ModeSymlink == 0 {
ctx.Logger().V(5).Info("Symlink has been resolved", "symlinkPath", symlinkPath, "resolvedPath", resolvedFilePath, "depth", depth+1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue: This line is too long too, but specifically we (mostly) have a pattern of:

ctx.Logger().V(5).Info("Symlink has been resolved",
  "symlinkPath", symlinkPath,
  "resolvedPath", resolvedFilePath,
  "depth", depth+1,
)

This way the names are paired up with the value nicely.

return fileInfo, filepath.Clean(resolvedFilePath), nil
}
resolvedPath = filepath.Clean(resolvedFilePath)
}
return nil, "", fmt.Errorf("Unable to resolve symlink %s for the specified depth %d", symlinkPath, s.maxSymlinkDepth)
}

// Chunks emits chunks of bytes over a channel.
func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error {
for i, path := range s.paths {
Expand All @@ -100,15 +146,29 @@ func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ .
continue
}

// This will always be the FileInfo we use to decide dir vs file
targetInfo := fileInfo
targetInfoPath := cleanPath

if fileInfo.Mode()&os.ModeSymlink != 0 {
logger.Info("skipping, not a regular file", "path", cleanPath)
continue
if !s.followSymlinks {
// If the file or directory is a symlink but the followSymlinks is disable ignore the path
logger.Info("skipping, following symlinks is not allowed", "path", cleanPath)
continue
}
// if the root path is a symlink resolve the path and check if it resolves to a dir or file
// if targetInfo is a dir then call scanDir otherwise call scanFile
targetInfo, targetInfoPath, err = s.resolveSymLink(ctx, path)
if err != nil {
logger.Error(err, err.Error())
continue
}
}

if fileInfo.IsDir() {
err = s.scanDir(ctx, cleanPath, chunksChan)
if targetInfo.IsDir() {
err = s.scanDir(ctx, targetInfoPath, chunksChan)
} else {
err = s.scanFile(ctx, cleanPath, chunksChan)
err = s.scanFile(ctx, targetInfoPath, chunksChan)
}

if err != nil && !errors.Is(err, io.EOF) {
Expand All @@ -120,6 +180,19 @@ func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ .

return nil
}

func (s *Source) checkAndMarkVistiedSymlink(resolvedpath string) bool {
cleanedPath := filepath.Clean(resolvedpath)
s.visitedmu.Lock()
defer s.visitedmu.Unlock()

if _, seen := s.visitedSymlinkPaths[cleanedPath]; seen {
return true
}
s.visitedSymlinkPaths[cleanedPath] = struct{}{}
return false
}

func (s *Source) scanDir(ctx context.Context, path string, chunksChan chan *sources.Chunk) error {
workerPool := new(errgroup.Group)
workerPool.SetLimit(s.concurrency)
Expand All @@ -129,14 +202,14 @@ func (s *Source) scanDir(ctx context.Context, path string, chunksChan chan *sour
}()
startState := s.GetEncodedResumeInfoFor(path)
resuming := startState != ""

return fs.WalkDir(os.DirFS(path), ".", func(relativePath string, d fs.DirEntry, err error) error {
if err != nil {
ctx.Logger().Error(err, "error walking directory")
return nil
}

fullPath := filepath.Join(path, relativePath)
ctx.Logger().V(5).Info("Full path found is", "fullPath", fullPath)

// check if the full path is not matching any pattern in include FilterRuleSet and matching any exclude FilterRuleSet.
if s.filter != nil && !s.filter.Pass(fullPath) {
Expand All @@ -148,6 +221,54 @@ func (s *Source) scanDir(ctx context.Context, path string, chunksChan chan *sour
return nil // skip the file
}

if d.Type()&os.ModeSymlink != 0 {
if s.followSymlinks {
ctx.Logger().V(5).Info("Directory/File found is a symlink", "path", path)
// if the found directory or file is symlink resolve the symlink
resolved, resolvedPath, err := s.resolveSymLink(ctx, fullPath)
if err != nil {
ctx.Logger().Error(err, err.Error(), "path", fullPath)
return nil
}
// If symlink resolves to a file then scan it
if !resolved.IsDir() {
ctx.Logger().V(5).Info("Resolved symlink is a file", "path", resolvedPath)
if resuming {
// Since we store the resolved file path in encodeResumeInfo
// so we match the resolved with startState
if resolvedPath == startState {
resuming = false
}
return nil
}
workerPool.Go(func() error {
if err := s.scanFile(ctx, resolvedPath, chunksChan); err != nil {
ctx.Logger().Error(err, "error scanning file", "path", resolvedPath)
}
s.SetEncodedResumeInfoFor(path, resolvedPath)
return nil
})
return nil
}
// This check is to avoid loopy scenarios like A/linkToB->B and B/linkToA->A
// We are going to maintain the map of symlinks resolved to a directory
// as this case won't happen in symlinks resolved to file
if s.checkAndMarkVistiedSymlink(resolvedPath) {
ctx.Logger().V(5).Info("Resolved path is already visited", "path", resolvedPath)
return nil
}
// If symlink resolves to directory scan that directory
ctx.Logger().V(5).Info("Resolved symlink is a directory", "path", resolvedPath)
err = s.scanDir(ctx, resolvedPath, chunksChan)
if err != nil {
ctx.Logger().Error(err, "error occurred in nested recursive scanDir call")
}
return nil
}
// Skip symlinks if followSymlinks is false
return nil
}

// Skip over non-regular files. We do this check here to suppress noisy
// logs for trying to scan directories and other non-regular files in
// our traversal.
Expand Down Expand Up @@ -251,18 +372,35 @@ func (s *Source) ChunkUnit(ctx context.Context, unit sources.SourceUnit, reporte
if err != nil {
return reporter.ChunkErr(ctx, fmt.Errorf("unable to get file info: %w", err))
}
// This will always be the FileInfo we use to decide dir vs file
targetInfo := fileInfo
targetInfoPath := cleanPath
if fileInfo.Mode()&os.ModeSymlink != 0 {
if !s.followSymlinks {
// If the file or directory is a symlink but the followSymlinks is disable ignore the path
logger.Info("skipping, following symlinks is not enabled", "path", cleanPath)
return nil
}
// if the root path is a symlink resolve the path and check if it resolves to a dir or file
// if resolvedSymlinkInfo is a dir then call scanDir otherwise call scanFile
targetInfo, targetInfoPath, err = s.resolveSymLink(ctx, cleanPath)
if err != nil {
logger.Error(err, "unable to get symlink info")
return nil
}
}

ch := make(chan *sources.Chunk)
var scanErr error
go func() {
defer close(ch)
if fileInfo.IsDir() {
if targetInfo.IsDir() {
// TODO: Finer grain error tracking of individual chunks.
scanErr = s.scanDir(ctx, cleanPath, ch)
scanErr = s.scanDir(ctx, targetInfoPath, ch)
} else {
// TODO: Finer grain error tracking of individual
// chunks (in the case of archives).
scanErr = s.scanFile(ctx, cleanPath, ch)
scanErr = s.scanFile(ctx, targetInfoPath, ch)
}
}()

Expand Down
Loading
Loading