Skip to content

Commit

Permalink
Fix glob performance on large monorepos (#1244)
Browse files Browse the repository at this point in the history
The final patch goes back to the approach we had before #1234, where we'd compose the glob-matching functions instead of keeping a list of ignores, and recompute the matchers when needed.

The patch in #1234 was not optimised, as the matchers were being recreated for every file encountered, and optimising that in the first two commits of this PR got us down to 2x of the performance pre-1234.
That is unfortunately still not acceptable, so I reintroduced the function-composition approach, which is still prone to blowing the stack - the change here should reduce that risk, since instead of composing every line of gitignores as a separate matcher in the chain, we instead nest them in a single or block. That should dramatically reduce the size of the call chain.
  • Loading branch information
f-f authored Jul 5, 2024
1 parent 5634733 commit f130b33
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 43 deletions.
3 changes: 2 additions & 1 deletion src/Spago/Command/Build.purs
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,9 @@ run opts = do
if Array.null pedanticPkgs || opts.depsOnly then
pure true
else do
logInfo $ "Looking for unused and undeclared transitive dependencies..."
logInfo "Looking for unused and undeclared transitive dependencies..."
eitherGraph <- Graph.runGraph globs opts.pursArgs
logDebug "Decoded the output of `purs graph` successfully. Analyzing dependencies..."
eitherGraph # either (prepareToDie >>> (_ $> false)) \graph -> do
env <- ask
checkResults <- map Array.fold $ for pedanticPkgs \(Tuple selected options) -> do
Expand Down
3 changes: 1 addition & 2 deletions src/Spago/Glob.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import mm from 'micromatch';
import * as fsWalk from '@nodelib/fs.walk';

export const testGlob = glob => mm.matcher(glob.include, {ignore: glob.ignore});
export const testGlob = glob => mm.matcher(glob.include, { ignore: glob.ignore });

export const fsWalkImpl = Left => Right => respond => options => path => () => {
const entryFilter = entry => options.entryFilter(entry)();
Expand All @@ -13,4 +13,3 @@ export const fsWalkImpl = Left => Right => respond => options => path => () => {
};

export const isFile = dirent => dirent.isFile();

98 changes: 60 additions & 38 deletions src/Spago/Glob.purs
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@ import Control.Monad.Maybe.Trans (runMaybeT)
import Control.Monad.Trans.Class (lift)
import Data.Array as Array
import Data.Filterable (filter)
import Data.Foldable (any, fold)
import Data.Foldable (any, traverse_)
import Data.String as String
import Data.Traversable (traverse_)
import Data.String as String.CodePoint
import Effect.Aff as Aff
import Effect.Ref as Ref
import Node.FS.Sync as SyncFS
import Node.Path as Path
import Record as Record
import Type.Proxy (Proxy(..))

type Glob =
{ ignore :: Array String
Expand All @@ -30,8 +28,10 @@ splitGlob { ignore, include } = (\a -> { ignore, include: [ a ] }) <$> include
type Entry = { name :: String, path :: String, dirent :: DirEnt }
type FsWalkOptions = { entryFilter :: Entry -> Effect Boolean, deepFilter :: Entry -> Effect Boolean }

-- https://nodejs.org/api/fs.html#class-fsdirent
foreign import data DirEnt :: Type
foreign import isFile :: DirEnt -> Boolean

foreign import fsWalkImpl
:: (forall a b. a -> Either a b)
-> (forall a b. b -> Either a b)
Expand All @@ -40,32 +40,32 @@ foreign import fsWalkImpl
-> String
-> Effect Unit

gitignoreGlob :: String -> String -> Glob
gitignoreGlob base =
gitignoreFileToGlob :: FilePath -> String -> Glob
gitignoreFileToGlob base =
String.split (String.Pattern "\n")
>>> map String.trim
>>> Array.filter (not <<< or [ String.null, isComment ])
>>> partitionMap
( \line -> do
let
resolve a = Path.concat [ base, a ]
pat a = withForwardSlashes $ resolve $ unpackPattern a
let pattern lin = withForwardSlashes $ Path.concat [ base, gitignorePatternToGlobPattern lin ]
case String.stripPrefix (String.Pattern "!") line of
Just negated -> Left $ pat negated
Nothing -> Right $ pat line
Just negated -> Left $ pattern negated
Nothing -> Right $ pattern line
)
>>> Record.rename (Proxy @"left") (Proxy @"ignore")
>>> Record.rename (Proxy @"right") (Proxy @"include")
>>> (\{ left, right } -> { ignore: left, include: right })

where
isComment = isJust <<< String.stripPrefix (String.Pattern "#")
leadingSlash = String.stripPrefix (String.Pattern "/")
trailingSlash = String.stripSuffix (String.Pattern "/")
dropSuffixSlash str = fromMaybe str $ String.stripSuffix (String.Pattern "/") str
dropPrefixSlash str = fromMaybe str $ String.stripPrefix (String.Pattern "/") str

leadingSlash str = String.codePointAt 0 str == Just (String.CodePoint.codePointFromChar '/')
trailingSlash str = String.codePointAt (String.length str - 1) str == Just (String.CodePoint.codePointFromChar '/')

unpackPattern :: String -> String
unpackPattern pattern
| Just a <- trailingSlash pattern = unpackPattern a
| Just a <- leadingSlash pattern = a <> "/**"
gitignorePatternToGlobPattern :: String -> String
gitignorePatternToGlobPattern pattern
| trailingSlash pattern = gitignorePatternToGlobPattern $ dropSuffixSlash pattern
| leadingSlash pattern = dropPrefixSlash pattern <> "/**"
| otherwise = "**/" <> pattern <> "/**"

fsWalk :: String -> Array String -> Array String -> Aff (Array Entry)
Expand All @@ -74,41 +74,63 @@ fsWalk cwd ignorePatterns includePatterns = Aff.makeAff \cb -> do

-- Pattern for directories which can be outright ignored.
-- This will be updated whenver a .gitignore is found.
ignoreMatcherRef :: Ref Glob <- Ref.new { ignore: [], include: ignorePatterns }
ignoreMatcherRef :: Ref (String -> Boolean) <- Ref.new (testGlob { ignore: [], include: ignorePatterns })

-- If this Ref contains `true` because this Aff has been canceled, then deepFilter will always return false.
canceled <- Ref.new false

let
entryGitignore :: Entry -> Effect Unit
entryGitignore entry =
try (SyncFS.readTextFile UTF8 entry.path)
>>= traverse_ \gitignore ->
let
base = Path.relative cwd $ Path.dirname entry.path
glob = gitignoreGlob base gitignore
pats = splitGlob glob
patOk g = not $ any (testGlob g) includePatterns
newPats = filter patOk pats
in
void $ Ref.modify (_ <> fold newPats) $ ignoreMatcherRef
-- Update the ignoreMatcherRef with the patterns from a .gitignore file
updateIgnoreMatcherWithGitignore :: Entry -> Effect Unit
updateIgnoreMatcherWithGitignore entry = do
let
gitignorePath = entry.path
-- directory of this .gitignore relative to the directory being globbed
base = Path.relative cwd (Path.dirname gitignorePath)

try (SyncFS.readTextFile UTF8 entry.path) >>= traverse_ \gitignore -> do
let
gitignored = testGlob <$> (splitGlob $ gitignoreFileToGlob base gitignore)

-- Do not add `.gitignore` patterns that explicitly ignore the files
-- we're searching for;
--
-- ex. if `includePatterns` is [".spago/p/aff-1.0.0/**/*.purs"],
-- and `gitignored` is ["node_modules", ".spago"],
-- then add "node_modules" to `ignoreMatcher` but not ".spago"
wouldConflictWithSearch matcher = any matcher includePatterns

newMatchers = or $ filter (not <<< wouldConflictWithSearch) gitignored

-- Another possible approach could be to keep a growing array of patterns and
-- regenerate the matcher on every gitignore. We have tried that (see #1234),
-- and turned out to be 2x slower. (see #1242, and #1244)
-- Composing functions is faster, but there's the risk of blowing the stack
-- (see #1231) - when this was introduced in #1210, every match from the
-- gitignore file would be `or`ed to the previous matcher, which would create
-- a very long (linear) call chain - in this latest iteration we are `or`ing the
-- new matchers together, then the whole thing with the previous matcher.
-- This is still prone to stack issues, but we now have a tree so it should
-- not be as dramatic.
addMatcher currentMatcher = or [ currentMatcher, newMatchers ]

Ref.modify_ addMatcher ignoreMatcherRef

-- Should `fsWalk` recurse into this directory?
deepFilter :: Entry -> Effect Boolean
deepFilter entry = fromMaybe false <$> runMaybeT do
isCanceled <- lift $ Ref.read canceled
guard $ not isCanceled
shouldIgnore <- lift $ testGlob <$> Ref.read ignoreMatcherRef
shouldIgnore <- lift $ Ref.read ignoreMatcherRef
pure $ not $ shouldIgnore $ Path.relative cwd entry.path

-- Should `fsWalk` retain this entry for the result array?
entryFilter :: Entry -> Effect Boolean
entryFilter entry = do
when (isFile entry.dirent && entry.name == ".gitignore") (entryGitignore entry)
ignorePat <- Ref.read ignoreMatcherRef
let
ignoreMatcher = testGlob ignorePat
path = withForwardSlashes $ Path.relative cwd entry.path
when (isFile entry.dirent && entry.name == ".gitignore") do
updateIgnoreMatcherWithGitignore entry
ignoreMatcher <- Ref.read ignoreMatcherRef
let path = withForwardSlashes $ Path.relative cwd entry.path
pure $ includeMatcher path && not (ignoreMatcher path)

options = { entryFilter, deepFilter }
Expand Down
8 changes: 6 additions & 2 deletions test/Spago/Glob.purs
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,14 @@ spec = Spec.around globTmpDir do

Spec.it "is stacksafe" \p -> do
let
chars = [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k" ]
-- 10,000-line gitignore
chars = [ "a", "b", "c", "d", "e", "f", "g", "h" ]
-- 4000-line gitignore
words = [ \a b c d -> a <> b <> c <> d ] <*> chars <*> chars <*> chars <*> chars
hugeGitignore = intercalate "\n" words
-- Write it in a few places
FS.writeTextFile (Path.concat [ p, ".gitignore" ]) hugeGitignore
FS.writeTextFile (Path.concat [ p, "fruits", ".gitignore" ]) hugeGitignore
FS.writeTextFile (Path.concat [ p, "fruits", "left", ".gitignore" ]) hugeGitignore
FS.writeTextFile (Path.concat [ p, "fruits", "right", ".gitignore" ]) hugeGitignore
a <- Glob.gitignoringGlob p [ "fruits/**/apple" ]
Array.sort a `Assert.shouldEqual` [ "fruits/left/apple", "fruits/right/apple" ]

0 comments on commit f130b33

Please sign in to comment.