refactor: WIP. Module name to filepath optimisation

guibou · guibou · commit ab06e5138156 · 2025-05-26T17:49:34.000+04:00
This is related to #4598. This changes the file-to-module association logic used during dependency graph building. Before, each time a module `Foo.Bar` was found, HLS tested inside all the import paths for the existence of a relevant file. It means that for `n` import paths and `m` modules to locate, `m * n` filesystem operations are done. Note also that this involves a lot of complex string concatenation primitives to build the `FilePath`. A module is tested for each `import` in each of the files of the project. We also test for `boot` files, doubling the number of tests. In #4598 we have a project with `1100` modules, in more than 250 import paths, and we count more than `17000` `import` statements, resulting in over 6 million file existence tests. This project was blocking for more than 3 minutes during HLS startup. This commit changes the way this is computed: - At startup, a `Map ModuleName FilePath` (the real type is a bit more involved for performance, multiple units and boot files handling) is built by scanning all the import paths for files representing the different modules. - Directory scanning is efficient and if the import paths only contain Haskell modules, this will never do more work than listing the files of the project. - The lookup is now simply a `Map` lookup. The performance improvement is as follows: - The number of IO operations is dramatically reduced, from multiple millions to a few recursive directory listings. - A lot of the path conversion boilerplate has been removed. - TODO: add RTS stats before / after with the number of allocations - On my project, the graph building time is reduced from a few minutes to 3s. Limitations: - How to rebuild the `Map` if the content of one directory changes? - If one directory is filled with millions of files which are not of interest, performance can be damaged. TODO: add a diagnostic during this phase so the user can learn about this issue.
Code status: - The `lookup` is not fully restored; in particular, it does not yet handle home units or reexports. - The initialisation phase is cached inside a `TVar` stored as a top-level identifier using `unsafePerformIO`. This is to be improved.
diff --git a/ghcide/ghcide.cabal b/ghcide/ghcide.cabal
@@ -107,6 +107,7 @@ library
     , unliftio-core
     , unordered-containers         >=0.2.10.0
     , vector
+    , pretty-simple
 
   if os(windows)
     build-depends: Win32
diff --git a/ghcide/src/Development/IDE/Core/RuleTypes.hs b/ghcide/src/Development/IDE/Core/RuleTypes.hs
@@ -392,6 +392,9 @@ type instance RuleResult GetModSummary = ModSummaryResult
 -- | Generate a ModSummary with the timestamps and preprocessed content elided, for more successful early cutoff
 type instance RuleResult GetModSummaryWithoutTimestamps = ModSummaryResult
 
+type instance RuleResult GetModulesPaths = (M.Map ModuleName (UnitId, NormalizedFilePath),
+                                            M.Map ModuleName (UnitId, NormalizedFilePath))
+
 data GetParsedModule = GetParsedModule
     deriving (Eq, Show, Generic)
 instance Hashable GetParsedModule
@@ -494,6 +497,13 @@ data GetModSummaryWithoutTimestamps = GetModSummaryWithoutTimestamps
 instance Hashable GetModSummaryWithoutTimestamps
 instance NFData   GetModSummaryWithoutTimestamps
 
+-- | Scan all the import directories for existing modules and build maps from
+-- module name to file path (one for regular modules, one for @-boot@ files)
+data GetModulesPaths = GetModulesPaths
+    deriving (Eq, Show, Generic)
+instance Hashable GetModulesPaths
+instance NFData   GetModulesPaths
+
 data GetModSummary = GetModSummary
     deriving (Eq, Show, Generic)
 instance Hashable GetModSummary
diff --git a/ghcide/src/Development/IDE/Core/Rules.hs b/ghcide/src/Development/IDE/Core/Rules.hs
@@ -4,6 +4,7 @@
 {-# LANGUAGE CPP                   #-}
 {-# LANGUAGE DuplicateRecordFields #-}
 {-# LANGUAGE TypeFamilies          #-}
+{-# LANGUAGE PartialTypeSignatures #-}
 
 -- | A Shake implementation of the compiler service, built
 --   using the "Shaker" abstraction layer for in-memory use.
@@ -93,7 +94,7 @@ import           Data.Proxy
 import qualified Data.Text                                    as T
 import qualified Data.Text.Encoding                           as T
 import qualified Data.Text.Utf16.Rope.Mixed                   as Rope
-import           Data.Time                                    (UTCTime (..))
+import           Data.Time                                    (UTCTime (..), getCurrentTime, diffUTCTime)
 import           Data.Time.Clock.POSIX                        (posixSecondsToUTCTime)
 import           Data.Tuple.Extra
 import           Data.Typeable                                (cast)
@@ -173,6 +174,12 @@ import           System.Info.Extra                            (isWindows)
 
 import qualified Data.IntMap                                  as IM
 import           GHC.Fingerprint
+import Text.Pretty.Simple
+import qualified Data.Map.Strict as Map
+import System.FilePath (takeExtension, takeFileName, normalise, dropTrailingPathSeparator, dropExtension, splitDirectories)
+import Data.Char (isUpper)
+import System.Directory.Extra (listFilesRecursive, listFilesInside)
+import System.IO.Unsafe
 
 data Log
   = LogShake Shake.Log
@@ -311,6 +318,7 @@ getParsedModuleDefinition packageState opt file ms = do
 getLocatedImportsRule :: Recorder (WithPriority Log) -> Rules ()
 getLocatedImportsRule recorder =
     define (cmapWithPrio LogShake recorder) $ \GetLocatedImports file -> do
+
         ModSummaryResult{msrModSummary = ms} <- use_ GetModSummaryWithoutTimestamps file
         (KnownTargets targets targetsMap) <- useNoFile_ GetKnownTargets
         let imports = [(False, imp) | imp <- ms_textual_imps ms] ++ [(True, imp) | imp <- ms_srcimps ms]
@@ -333,8 +341,11 @@ getLocatedImportsRule recorder =
                 | otherwise = do
                     itExists <- getFileExists nfp
                     return $ if itExists then Just nfp else Nothing
+
+        moduleMaps <- use_ GetModulesPaths file
         (diags, imports') <- fmap unzip $ forM imports $ \(isSource, (mbPkgName, modName)) -> do
-            diagOrImp <- locateModule (hscSetFlags dflags env) import_dirs (optExtensions opt) getTargetFor modName mbPkgName isSource
+
+            diagOrImp <- locateModule moduleMaps (hscSetFlags dflags env) import_dirs (optExtensions opt) getTargetFor modName mbPkgName isSource
             case diagOrImp of
                 Left diags              -> pure (diags, Just (modName, Nothing))
                 Right (FileImport path) -> pure ([], Just (modName, Just path))
@@ -624,6 +635,43 @@ getModuleGraphRule recorder = defineEarlyCutOffNoFile (cmapWithPrio LogShake rec
   fs <- toKnownFiles <$> useNoFile_ GetKnownTargets
   dependencyInfoForFiles (HashSet.toList fs)
 
+-- | Process-wide cache of directory scans, keyed by the session's 'envUnique'.
+-- WIP: a top-level 'unsafePerformIO' 'TVar'; NOINLINE keeps it a single shared cell.
+{-# NOINLINE cacheVar #-}
+cacheVar = unsafePerformIO (newTVarIO mempty)
+
+-- | Scan the import paths of every home unit and map module names to files.
+-- The first map holds regular modules, the second one the @-boot@ files.
+getModulesPathsRule :: Recorder (WithPriority Log) -> Rules ()
+getModulesPathsRule recorder = defineEarlyCutoff (cmapWithPrio LogShake recorder) $ Rule $ \GetModulesPaths file -> do
+  env_eq <- use_ GhcSession file
+  cache <- liftIO (readTVarIO cacheVar)
+  case Map.lookup (envUnique env_eq) cache of
+    Just res -> pure (mempty, ([], Just res))
+    Nothing -> do
+      let import_dirs = map (second homeUnitEnv_dflags) $ hugElts $ hsc_HUG (hscEnv env_eq)
+      opt <- getIdeOptions
+      let acceptedExtensions = concatMap (\x -> ['.':x, '.':x <> "-boot"]) (optExtensions opt)
+      (unzip -> (a, b)) <- flip mapM import_dirs $ \(u, dyn) -> do
+        (unzip -> (a, b)) <- flip mapM (importPaths dyn) $ \dir' -> do
+          let dir = dropTrailingPathSeparator dir'
+          -- Recurse only into the root and into directories named like a module component.
+          -- 'take 1' instead of 'head' keeps the predicate total on an empty file name.
+          let predicate path = pure (path == dir || any isUpper (take 1 (takeFileName path)))
+          let dir_number_directories = length (splitDirectories dir)
+          let toModule fp = mkModuleName (intercalate "." $ drop dir_number_directories (splitDirectories (dropExtension fp)))
+          -- TODO: we are taking/droping extension, this could be factorized to save a few cpu cycles ;)
+          -- TODO: do acceptedextensions needs to be a set ? or a vector?
+          modules <- fmap (\path -> (toModule path, toNormalizedFilePath' path)) . filter (\y -> takeExtension y `elem` acceptedExtensions) <$> liftIO (listFilesInside predicate dir)
+          let isSourceModule (_, path) = "-boot" `isSuffixOf` fromNormalizedFilePath path
+          let (sourceModules, notSourceModules) = partition isSourceModule modules
+          pure $ (Map.fromList notSourceModules, Map.fromList sourceModules)
+        pure (fmap (u,) $ mconcat a, fmap (u, ) $ mconcat b)
+
+      let res = (mconcat a, mconcat b)
+      liftIO $ atomically $ modifyTVar' cacheVar (Map.insert (envUnique env_eq) res)
+      pure (mempty, ([], Just res))
+
 getModuleGraphSingleFileRule :: Recorder (WithPriority Log) -> Rules ()
 getModuleGraphSingleFileRule recorder =
     defineEarlyCutoff (cmapWithPrio LogShake recorder) $ Rule $ \GetFileModuleGraph file -> do
@@ -632,8 +680,12 @@ getModuleGraphSingleFileRule recorder =
 
 dependencyInfoForFiles :: [NormalizedFilePath] -> Action (BS.ByteString, DependencyInformation)
 dependencyInfoForFiles fs = do
+  -- liftIO $ print ("fs length", length fs)
   (rawDepInfo, bm) <- rawDependencyInformation fs
+  -- liftIO $ print ("ok with raw deps")
+  -- liftIO $ pPrint rawDepInfo
   let (all_fs, _all_ids) = unzip $ HM.toList $ pathToIdMap $ rawPathIdMap rawDepInfo
+  -- liftIO $ print ("all_fs length", length all_fs)
   msrs <- uses GetModSummaryWithoutTimestamps all_fs
   let mss = map (fmap msrModSummary) msrs
   let deps = map (\i -> IM.lookup (getFilePathId i) (rawImports rawDepInfo)) _all_ids
@@ -1232,6 +1284,7 @@ mainRule recorder RulesConfig{..} = do
     getModIfaceRule recorder
     getModSummaryRule templateHaskellWarning recorder
     getModuleGraphRule recorder
+    getModulesPathsRule recorder
     getModuleGraphSingleFileRule recorder
     getFileHashRule recorder
     knownFilesRule recorder
diff --git a/ghcide/src/Development/IDE/Import/FindImports.hs b/ghcide/src/Development/IDE/Import/FindImports.hs
@@ -5,7 +5,6 @@
 
 module Development.IDE.Import.FindImports
   ( locateModule
-  , locateModuleFile
   , Import(..)
   , ArtifactsLocation(..)
   , modSummaryToArtifactsLocation
@@ -14,9 +13,8 @@ module Development.IDE.Import.FindImports
   ) where
 
 import           Control.DeepSeq
-import           Control.Monad.Extra
 import           Control.Monad.IO.Class
-import           Data.List                         (find, isSuffixOf)
+import           Data.List                         (isSuffixOf)
 import           Data.Maybe
 import qualified Data.Set                          as S
 import           Development.IDE.GHC.Compat        as Compat
@@ -26,7 +24,8 @@ import           Development.IDE.Types.Diagnostics
 import           Development.IDE.Types.Location
 import           GHC.Types.PkgQual
 import           GHC.Unit.State
-import           System.FilePath
+import Data.Map.Strict (Map)
+import qualified Data.Map.Strict as Map
 
 
 #if MIN_VERSION_ghc(9,11,0)
@@ -70,6 +69,7 @@ data LocateResult
   | LocateFoundReexport UnitId
   | LocateFoundFile UnitId NormalizedFilePath
 
+{-
 -- | locate a module in the file system. Where we go from *daml to Haskell
 locateModuleFile :: MonadIO m
              => [(UnitId, [FilePath], S.Set ModuleName)]
@@ -94,6 +94,7 @@ locateModuleFile import_dirss exts targetFor isSource modName = do
     maybeBoot ext
       | isSource = ext ++ "-boot"
       | otherwise = ext
+-}
 
 -- | This function is used to map a package name to a set of import paths.
 -- It only returns Just for unit-ids which are possible to import into the
@@ -110,36 +111,47 @@ mkImportDirs _env (i, flags) = Just (i, (importPaths flags, reexportedModules fl
 -- Haskell
 locateModule
     :: MonadIO m
-    => HscEnv
+    => (Map ModuleName (UnitId, NormalizedFilePath),Map ModuleName (UnitId, NormalizedFilePath))
+ -> HscEnv
     -> [(UnitId, DynFlags)] -- ^ Import directories
     -> [String]                        -- ^ File extensions
     -> (ModuleName -> NormalizedFilePath -> m (Maybe NormalizedFilePath))  -- ^ does file exist predicate
     -> Located ModuleName              -- ^ Module name
     -> PkgQual                -- ^ Package name
     -> Bool                            -- ^ Is boot module
     -> m (Either [FileDiagnostic] Import)
-locateModule env comp_info exts targetFor modName mbPkgName isSource = do
+locateModule moduleMaps@(moduleMap, moduleMapSource) env comp_info exts targetFor modName mbPkgName isSource = do
   case mbPkgName of
     -- 'ThisPkg' just means some home module, not the current unit
     ThisPkg uid
+      -- TODO: there are MANY lookup on import_paths, which is a problem considering that it can be large.
       | Just (dirs, reexports) <- lookup uid import_paths
-          -> lookupLocal uid dirs reexports
+          -> lookupLocal moduleMaps uid dirs reexports
       | otherwise -> return $ Left $ notFoundErr env modName $ LookupNotFound []
     -- if a package name is given we only go look for a package
     OtherPkg uid
       | Just (dirs, reexports) <- lookup uid import_paths
-          -> lookupLocal uid dirs reexports
+          -> lookupLocal moduleMaps uid dirs reexports
       | otherwise -> lookupInPackageDB
     NoPkgQual -> do
 
       -- Reexports for current unit have to be empty because they only apply to other units depending on the
       -- current unit. If we set the reexports to be the actual reexports then we risk looping forever trying
       -- to find the module from the perspective of the current unit.
-      mbFile <- locateModuleFile ((homeUnitId_ dflags, importPaths dflags, S.empty) : other_imports) exts targetFor isSource $ unLoc modName
+      ---- locateModuleFile ((homeUnitId_ dflags, importPaths dflags, S.empty) : other_imports) exts targetFor isSource $ unLoc modName
+      --
+      -- TODO: handle the other imports, the unit id, ..., reexport.
+      --   - Previous implementation was using homeUnitId dflags
+      --   - Handle the -boot
+      --   - Have a look at "targetFor"
+      --
+      let mbFile = case Map.lookup (unLoc modName) (if isSource then moduleMapSource else moduleMap) of
+                     Nothing -> LocateNotFound
+                     Just (uid, file) -> LocateFoundFile uid file
       case mbFile of
         LocateNotFound -> lookupInPackageDB
         -- Lookup again with the perspective of the unit reexporting the file
-        LocateFoundReexport uid -> locateModule (hscSetActiveUnitId uid env) comp_info exts targetFor modName noPkgQual isSource
+        LocateFoundReexport uid -> locateModule moduleMaps (hscSetActiveUnitId uid env) comp_info exts targetFor modName noPkgQual isSource
         LocateFoundFile uid file -> toModLocation uid file
   where
     dflags = hsc_dflags env
@@ -180,12 +192,16 @@ locateModule env comp_info exts targetFor modName mbPkgName isSource = do
         let genMod = mkModule (RealUnit $ Definite uid) (unLoc modName)  -- TODO support backpack holes
         return $ Right $ FileImport $ ArtifactsLocation file (Just loc) (not isSource) (Just genMod)
 
-    lookupLocal uid dirs reexports = do
-      mbFile <- locateModuleFile [(uid, dirs, reexports)] exts targetFor isSource $ unLoc modName
+    -- Resolve a package-qualified home-unit import via the prebuilt maps. TODO: reexports (and @dirs@) are still ignored.
+    lookupLocal moduleMaps@(moduleMap, moduleMapSource) uid dirs reexports = do
+      -- The pair is (regular modules, -boot modules), the same order 'locateModule' destructures it in.
+      let mbFile = case Map.lookup (unLoc modName) (if isSource then moduleMapSource else moduleMap) of
+                     Just (uid', file) | uid' == uid -> LocateFoundFile uid' file
+                     _ -> LocateNotFound
       case mbFile of
         LocateNotFound -> return $ Left $ notFoundErr env modName $ LookupNotFound []
         -- Lookup again with the perspective of the unit reexporting the file
-        LocateFoundReexport uid' -> locateModule (hscSetActiveUnitId uid' env) comp_info exts targetFor modName noPkgQual isSource
+        LocateFoundReexport uid' -> locateModule moduleMaps (hscSetActiveUnitId uid' env) comp_info exts targetFor modName noPkgQual isSource
         LocateFoundFile uid' file -> toModLocation uid' file
 
     lookupInPackageDB = do
diff --git a/ghcide/src/Development/IDE/Types/HscEnvEq.hs b/ghcide/src/Development/IDE/Types/HscEnvEq.hs
@@ -1,7 +1,7 @@
 {-# LANGUAGE CPP #-}
 module Development.IDE.Types.HscEnvEq
 (   HscEnvEq,
-    hscEnv, newHscEnvEq,
+    hscEnv, newHscEnvEq, envUnique,
     updateHscEnvEq,
     envPackageExports,
     envVisibleModuleNames,