From b8a3f9002cec59f6fad2d63276f80f664965a259 Mon Sep 17 00:00:00 2001 From: "Trevor L. McDonell" Date: Sat, 17 Apr 2021 00:57:56 +0200 Subject: [PATCH] add profile hooks --- accelerate.cabal | 59 +----- cbits/monitoring.c | 6 - .../Debug/Internal => icebox}/Monitoring.hs | 0 src/Data/Array/Accelerate/Array/Data.hs | 39 ++-- src/Data/Array/Accelerate/Array/Remote/LRU.hs | 12 +- .../Array/Accelerate/Array/Remote/Nursery.hs | 9 +- .../Array/Accelerate/Array/Remote/Table.hs | 8 +- src/Data/Array/Accelerate/Debug/Internal.hs | 14 +- .../Accelerate/Debug/Internal/Profile.hs | 172 ++++++++++++++++++ .../Array/Accelerate/Debug/Internal/Timed.hs | 4 +- .../Array/Accelerate/Debug/Internal/Trace.hs | 7 +- .../Array/Accelerate/Debug/Internal/Tracy.hs | 133 +++++++++++--- 12 files changed, 338 insertions(+), 125 deletions(-) rename {src/Data/Array/Accelerate/Debug/Internal => icebox}/Monitoring.hs (100%) create mode 100644 src/Data/Array/Accelerate/Debug/Internal/Profile.hs diff --git a/accelerate.cabal b/accelerate.cabal index 0972281a7..a42794aa5 100644 --- a/accelerate.cabal +++ b/accelerate.cabal @@ -234,47 +234,10 @@ flag debug * @dump-sched@: Print information related to execution scheduling. . -flag ekg - manual: True - default: False - description: - Enable hooks for monitoring the running application using EKG. Implies - @debug@ mode. In order to view the metrics, your application will need to - call @Data.Array.Accelerate.Debug.beginMonitoring@ before running any - Accelerate computations. This will launch the server on the local machine at - port 8000. - . - Alternatively, if you wish to configure the EKG monitoring server you can - initialise it like so: - . - > import Data.Array.Accelerate.Debug - > - > import System.Metrics - > import System.Remote.Monitoring - > - > main :: IO () - > main = do - > store <- initAccMetrics - > registerGcMetrics store -- optional - > - > server <- forkServerWith store "localhost" 8000 - > - > ... - . - Note that, as with any program utilising EKG, in order to collect Haskell GC - statistics, you must either run the program with: - . - > +RTS -T -RTS - . - or compile it with: - . - > -with-rtsopts=-T - . - flag prof manual: True default: False - description: Enable hooks for profiling the running application using Tracy + description: Enable hooks for profiling the running application flag bounds-checks manual: True @@ -324,6 +287,7 @@ library , template-haskell , terminal-size >= 0.3 , text >= 1.2 + , text-format >= 0.3 , transformers >= 0.3 , unique , unordered-containers >= 0.2 @@ -417,7 +381,7 @@ library Data.Array.Accelerate.Debug.Internal.Clock Data.Array.Accelerate.Debug.Internal.Flags Data.Array.Accelerate.Debug.Internal.Graph - Data.Array.Accelerate.Debug.Internal.Monitoring + Data.Array.Accelerate.Debug.Internal.Profile Data.Array.Accelerate.Debug.Internal.Stats Data.Array.Accelerate.Debug.Internal.Timed Data.Array.Accelerate.Debug.Internal.Trace @@ -521,33 +485,28 @@ library hs-source-dirs: src - if flag(debug) || flag(ekg) + if flag(debug) || flag(prof) cc-options: -DACCELERATE_DEBUG cpp-options: -DACCELERATE_DEBUG - if flag(ekg) - cpp-options: - -DACCELERATE_MONITORING - - build-depends: - async >= 2.0 - , ekg >= 0.1 - , ekg-core >= 0.1 - , text >= 1.0 - if flag(prof) + cc-options: + -DACCELERATE_PROFILE + cxx-options: -O3 -Wall -march=native -std=c++11 + -DACCELERATE_PROFILE -DTRACY_ENABLE -DTRACY_NO_SAMPLING cpp-options: + -DACCELERATE_PROFILE -DTRACY_ENABLE -DTRACY_NO_SAMPLING diff --git a/cbits/monitoring.c b/cbits/monitoring.c index 1cf412184..6a61760ca 100644 --- a/cbits/monitoring.c +++ b/cbits/monitoring.c @@ -23,12 +23,6 @@ /* These monitoring counters are globals which will be accessed from the * Haskell side. */ -int64_t DOUBLE_CACHE_ALIGNED __active_ns_llvm_native = 0; -int64_t DOUBLE_CACHE_ALIGNED __active_ns_llvm_ptx = 0; - -int64_t DOUBLE_CACHE_ALIGNED __current_bytes_remote = 0; -int64_t DOUBLE_CACHE_ALIGNED __current_bytes_nursery = 0; - int64_t DOUBLE_CACHE_ALIGNED __total_bytes_allocated_local = 0; int64_t DOUBLE_CACHE_ALIGNED __total_bytes_allocated_remote = 0; int64_t DOUBLE_CACHE_ALIGNED __total_bytes_copied_to_remote = 0; diff --git a/src/Data/Array/Accelerate/Debug/Internal/Monitoring.hs b/icebox/Monitoring.hs similarity index 100% rename from src/Data/Array/Accelerate/Debug/Internal/Monitoring.hs rename to icebox/Monitoring.hs diff --git a/src/Data/Array/Accelerate/Array/Data.hs b/src/Data/Array/Accelerate/Array/Data.hs index 89607065e..9ff7269b6 100644 --- a/src/Data/Array/Accelerate/Array/Data.hs +++ b/src/Data/Array/Accelerate/Array/Data.hs @@ -1,4 +1,5 @@ {-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE OverloadedStrings #-} @@ -51,13 +52,16 @@ module Data.Array.Accelerate.Array.Data ( -- friends import Data.Array.Accelerate.Array.Unique -import Data.Array.Accelerate.Representation.Type import Data.Array.Accelerate.Error +import Data.Array.Accelerate.Representation.Type import Data.Array.Accelerate.Type import Data.Primitive.Vec +#ifdef ACCELERATE_PROFILE +import Data.Array.Accelerate.Lifetime +#endif import Data.Array.Accelerate.Debug.Internal.Flags -import Data.Array.Accelerate.Debug.Internal.Monitoring +import Data.Array.Accelerate.Debug.Internal.Profile import Data.Array.Accelerate.Debug.Internal.Trace @@ -281,15 +285,18 @@ runArrayData st = unsafePerformIO $ do -- intermediate arrays that contain meaningful data only on the device. -- allocateArray :: forall e. (HasCallStack, Storable e) => Int -> IO (UniqueArray e) -allocateArray !size - = internalCheck "size must be >= 0" (size >= 0) - $ newUniqueArray <=< unsafeInterleaveIO $ do - let bytes = size * sizeOf (undefined :: e) - new <- readIORef __mallocForeignPtrBytes - ptr <- new bytes - traceIO dump_gc $ printf "gc: allocated new host array (size=%d, ptr=%s)" bytes (show ptr) - didAllocateBytesLocal (fromIntegral bytes) - return (castForeignPtr ptr) +allocateArray !size = internalCheck "size must be >= 0" (size >= 0) $ do + arr <- newUniqueArray <=< unsafeInterleaveIO $ do + let bytes = size * sizeOf (undefined :: e) + new <- readIORef __mallocForeignPtrBytes + ptr <- new bytes + traceIO dump_gc $ build "gc: allocated new host array (size={}, ptr={})" (bytes, unsafeForeignPtrToPtr ptr) + local_memory_alloc (unsafeForeignPtrToPtr ptr) bytes + return (castForeignPtr ptr) +#ifdef ACCELERATE_PROFILE + addFinalizer (uniqueArrayData arr) (local_memory_free (unsafeUniqueArrayPtr arr)) +#endif + return arr -- | Register the given function as the callback to use to allocate new array -- data on the host containing the specified number of bytes. The returned array @@ -307,8 +314,8 @@ registerForeignPtrAllocator new = do __mallocForeignPtrBytes :: IORef (Int -> IO (ForeignPtr Word8)) __mallocForeignPtrBytes = unsafePerformIO $! newIORef mallocPlainForeignPtrBytesAligned --- | Allocate the given number of bytes with 16-byte alignment. This is --- essential for SIMD instructions. +-- | Allocate the given number of bytes with 64-byte (cache line) +-- alignment. This is essential for SIMD instructions. -- -- Additionally, we return a plain ForeignPtr, which unlike a regular ForeignPtr -- created with 'mallocForeignPtr' carries no finalisers. It is an error to try @@ -316,9 +323,9 @@ __mallocForeignPtrBytes = unsafePerformIO $! newIORef mallocPlainForeignPtrBytes -- since in Accelerate finalisers are handled using Lifetime -- mallocPlainForeignPtrBytesAligned :: Int -> IO (ForeignPtr a) -mallocPlainForeignPtrBytesAligned (I# size) = IO $ \s -> - case newAlignedPinnedByteArray# size 64# s of - (# s', mbarr# #) -> (# s', ForeignPtr (byteArrayContents# (unsafeCoerce# mbarr#)) (PlainPtr mbarr#) #) +mallocPlainForeignPtrBytesAligned (I# size#) = IO $ \s0 -> + case newAlignedPinnedByteArray# size# 64# s0 of + (# s1, mbarr# #) -> (# s1, ForeignPtr (byteArrayContents# (unsafeCoerce# mbarr#)) (PlainPtr mbarr#) #) liftArrayData :: Int -> TypeR e -> ArrayData e -> Q (TExp (ArrayData e)) diff --git a/src/Data/Array/Accelerate/Array/Remote/LRU.hs b/src/Data/Array/Accelerate/Array/Remote/LRU.hs index 6f09d7346..c82f06c75 100644 --- a/src/Data/Array/Accelerate/Array/Remote/LRU.hs +++ b/src/Data/Array/Accelerate/Array/Remote/LRU.hs @@ -43,12 +43,12 @@ import Data.Array.Accelerate.Array.Remote.Class import Data.Array.Accelerate.Array.Remote.Table ( StableArray, makeWeakArrayData ) import Data.Array.Accelerate.Array.Unique ( touchUniqueArray ) import Data.Array.Accelerate.Error ( internalError ) -import Data.Array.Accelerate.Representation.Elt -import Data.Array.Accelerate.Representation.Type +-- import Data.Array.Accelerate.Representation.Elt +-- import Data.Array.Accelerate.Representation.Type import Data.Array.Accelerate.Type import qualified Data.Array.Accelerate.Array.Remote.Table as Basic import qualified Data.Array.Accelerate.Debug.Internal.Flags as Debug -import qualified Data.Array.Accelerate.Debug.Internal.Monitoring as Debug +-- import qualified Data.Array.Accelerate.Debug.Internal.Profile as Debug import qualified Data.Array.Accelerate.Debug.Internal.Trace as Debug import Control.Concurrent.MVar ( MVar, newMVar, withMVar, takeMVar, putMVar, mkWeakMVar ) @@ -301,7 +301,7 @@ evictLRU !utbl !mt = trace "evictLRU/evicting-eldest-array" $ do Just arr -> do message (build "evictLRU/evicting {}" (show sa)) copyIfNecessary status n tp arr - liftIO $ Debug.didEvictBytes (remoteBytes tp n) + -- liftIO $ Debug.remote_memory_evict sa (remoteBytes tp n) liftIO $ Basic.freeStable @m mt sa liftIO $ HT.insert utbl sa (Used ts Evicted count tasks n tp weak_arr) return True @@ -322,8 +322,8 @@ evictLRU !utbl !mt = trace "evictLRU/evicting-eldest-array" $ do _ -> return prev eldest prev _ = return prev - remoteBytes :: SingleType e -> Int -> Int64 - remoteBytes tp n = fromIntegral (bytesElt (TupRsingle (SingleScalarType tp))) * fromIntegral n + -- remoteBytes :: SingleType e -> Int -> Int + -- remoteBytes tp n = bytesElt (TupRsingle (SingleScalarType tp)) * n evictable :: Status -> Bool evictable Clean = True diff --git a/src/Data/Array/Accelerate/Array/Remote/Nursery.hs b/src/Data/Array/Accelerate/Array/Remote/Nursery.hs index e773065b7..03ee9eced 100644 --- a/src/Data/Array/Accelerate/Array/Remote/Nursery.hs +++ b/src/Data/Array/Accelerate/Array/Remote/Nursery.hs @@ -20,7 +20,7 @@ module Data.Array.Accelerate.Array.Remote.Nursery ( -- friends import Data.Array.Accelerate.Error import Data.Array.Accelerate.Debug.Internal.Flags as Debug -import Data.Array.Accelerate.Debug.Internal.Monitoring as Debug +-- import Data.Array.Accelerate.Debug.Internal.Profile as Debug import Data.Array.Accelerate.Debug.Internal.Trace as Debug -- libraries @@ -75,7 +75,7 @@ lookup !key (Nursery !ref !_) = Just r -> case Seq.viewl r of v Seq.:< vs -> do - Debug.decreaseCurrentBytesNursery (fromIntegral key) + -- Debug.remote_memory_free_nursery v if Seq.null vs then return (Nothing, Just v) -- delete this entry from the map else return (Just vs, Just v) -- re-insert the tail @@ -89,7 +89,7 @@ lookup !key (Nursery !ref !_) = insert :: Int -> ptr Word8 -> Nursery ptr -> IO () insert !key !val (Nursery !ref _) = withMVar ref $ \nrs -> do - Debug.increaseCurrentBytesRemote (fromIntegral key) + -- Debug.remote_memory_alloc_nursery val key HT.mutate nrs key $ \case Nothing -> (Just (Seq.singleton val), ()) Just vs -> (Just (vs Seq.|> val), ()) @@ -103,8 +103,7 @@ cleanup delete !ref = do message "nursery cleanup" modifyMVar_ ref $ \nrs -> do HT.mapM_ (Seq.mapM delete . snd) nrs - Debug.setCurrentBytesNursery 0 - nrs' <- HT.new + nrs' <- HT.new return nrs' diff --git a/src/Data/Array/Accelerate/Array/Remote/Table.hs b/src/Data/Array/Accelerate/Array/Remote/Table.hs index 1629a855a..0baecf8cf 100644 --- a/src/Data/Array/Accelerate/Array/Remote/Table.hs +++ b/src/Data/Array/Accelerate/Array/Remote/Table.hs @@ -65,7 +65,7 @@ import Data.Array.Accelerate.Lifetime import Data.Array.Accelerate.Type import qualified Data.Array.Accelerate.Array.Remote.Nursery as N import qualified Data.Array.Accelerate.Debug.Internal.Flags as Debug -import qualified Data.Array.Accelerate.Debug.Internal.Monitoring as Debug +import qualified Data.Array.Accelerate.Debug.Internal.Profile as Debug import qualified Data.Array.Accelerate.Debug.Internal.Trace as Debug import GHC.Stack @@ -253,7 +253,7 @@ freeStable (MemoryTable !ref _ !nrs _) !sa = Just (RemoteArray !p !bytes _) -> do message (build "free/nursery: {} of {}" (show sa, showBytes bytes)) N.insert bytes (castRemotePtr @m p) nrs - Debug.decreaseCurrentBytesRemote (fromIntegral bytes) + -- Debug.remote_memory_free (unsafeRemotePtrToPtr @m p) return (Nothing, ()) @@ -274,7 +274,7 @@ insert mt@(MemoryTable !ref _ _ _) !tp !arr !ptr !bytes | SingleArrayDict <- sin key <- makeStableArray tp arr weak <- liftIO $ makeWeakArrayData tp arr () (Just $ freeStable @m mt key) message $ build "insert: {}" (show key) - liftIO $ Debug.increaseCurrentBytesRemote (fromIntegral bytes) + -- liftIO $ Debug.remote_memory_alloc (unsafeRemotePtrToPtr @m ptr) bytes liftIO $ withMVar ref $ \tbl -> HT.insert tbl key (RemoteArray (castRemotePtr @m ptr) bytes weak) @@ -312,7 +312,7 @@ clean mt@(MemoryTable _ weak_ref nrs _) = management "clean" nrs . liftIO $ do -- that finalizers are often significantly delayed, it is worth our while -- traversing the table and explicitly freeing any dead entires. -- - Debug.didRemoteGC + Debug.emit_remote_gc performGC yield mr <- deRefWeak weak_ref diff --git a/src/Data/Array/Accelerate/Debug/Internal.hs b/src/Data/Array/Accelerate/Debug/Internal.hs index 71a0a3c84..e260590de 100644 --- a/src/Data/Array/Accelerate/Debug/Internal.hs +++ b/src/Data/Array/Accelerate/Debug/Internal.hs @@ -17,7 +17,7 @@ module Data.Array.Accelerate.Debug.Internal ( debuggingIsEnabled, - monitoringIsEnabled, + profilingIsEnabled, boundsChecksAreEnabled, unsafeChecksAreEnabled, internalChecksAreEnabled, @@ -28,7 +28,7 @@ module Data.Array.Accelerate.Debug.Internal ( import Data.Array.Accelerate.Debug.Internal.Flags as Debug import Data.Array.Accelerate.Debug.Internal.Graph as Debug -import Data.Array.Accelerate.Debug.Internal.Monitoring as Debug +import Data.Array.Accelerate.Debug.Internal.Profile as Debug import Data.Array.Accelerate.Debug.Internal.Stats as Debug import Data.Array.Accelerate.Debug.Internal.Timed as Debug import Data.Array.Accelerate.Debug.Internal.Trace as Debug @@ -43,12 +43,12 @@ debuggingIsEnabled = True debuggingIsEnabled = False #endif -{-# INLINE monitoringIsEnabled #-} -monitoringIsEnabled :: Bool -#ifdef ACCELERATE_MONITORING -monitoringIsEnabled = True +{-# INLINE profilingIsEnabled #-} +profilingIsEnabled :: Bool +#ifdef ACCELERATE_PROFILE +profilingIsEnabled = True #else -monitoringIsEnabled = False +profilingIsEnabled = False #endif {-# INLINE boundsChecksAreEnabled #-} diff --git a/src/Data/Array/Accelerate/Debug/Internal/Profile.hs b/src/Data/Array/Accelerate/Debug/Internal/Profile.hs new file mode 100644 index 000000000..ac770572c --- /dev/null +++ b/src/Data/Array/Accelerate/Debug/Internal/Profile.hs @@ -0,0 +1,172 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE ForeignFunctionInterface #-} +{-# LANGUAGE TemplateHaskell #-} +{-# OPTIONS_GHC -fobject-code #-} +{-# OPTIONS_HADDOCK hide #-} +-- | +-- Module : Data.Array.Accelerate.Debug.Internal.Profile +-- Copyright : [2016..2020] The Accelerate Team +-- License : BSD3 +-- +-- Maintainer : Trevor L. McDonell +-- Stability : experimental +-- Portability : non-portable (GHC extensions) +-- + +module Data.Array.Accelerate.Debug.Internal.Profile ( + + local_memory_alloc, + local_memory_free, + + remote_memory_alloc, remote_memory_alloc_nursery, + remote_memory_free, remote_memory_free_nursery, + remote_memory_evict, + + memcpy_to_remote, + memcpy_from_remote, + + emit_remote_gc, + +) where + +#ifdef ACCELERATE_DEBUG +import Control.Monad +#endif +#ifdef ACCELERATE_PROFILE +import qualified Data.Array.Accelerate.Debug.Internal.Tracy as Tracy +#endif + +import Data.Atomic ( Atomic ) +import qualified Data.Atomic as Atomic + +import Data.Char +import Foreign.C.String +import Foreign.Ptr +import Language.Haskell.TH +import Language.Haskell.TH.Syntax + +import GHC.Ptr + + +-- Embed some string data into the constant section and grab a pointer +-- directly to it. +-- +-- XXX: This only allows us to track a single nursery, but in reality it is +-- one per remote memory space. May or may not be useful to separate this. +-- +runQ $ sequence + [ sigD (mkName "___nursery") (conT ''CString) + , valD (varP (mkName "___nursery")) (normalB (conE 'Ptr `appE` litE (stringPrimL (map (fromIntegral . ord) "nursery\0")))) [] + ] + +-- Allocations in the local memory space +-- +{-# INLINE local_memory_alloc #-} +{-# INLINE local_memory_free #-} +local_memory_alloc :: Ptr a -> Int -> IO () +local_memory_free :: Ptr a -> IO () +#ifndef ACCELERATE_DEBUG +local_memory_alloc _ _ = return () +local_memory_free _ = return () +#else + +local_memory_alloc _p n = do +#ifdef ACCELERATE_PROFILE + Tracy.emit_memory_alloc _p (fromIntegral n) 0 +#endif + void $ Atomic.add __total_bytes_allocated_local (fromIntegral n) + +local_memory_free _p = do +#ifdef ACCELERATE_PROFILE + Tracy.emit_memory_free _p 0 +#endif + return () +#endif + + +-- Allocations in the remote memory space +-- +{-# INLINE remote_memory_alloc #-} +{-# INLINE remote_memory_free #-} +{-# INLINE remote_memory_evict #-} +remote_memory_alloc :: CString -> Ptr a -> Int -> IO () +remote_memory_free :: CString -> Ptr a -> IO () +remote_memory_evict :: CString -> Ptr a -> Int -> IO () +#ifndef ACCELERATE_DEBUG +remote_memory_alloc _ _ _ = return () +remote_memory_free _ _ = return () +remote_memory_evict _ _ _ = return () +#else +remote_memory_alloc _name _ptr bytes = do +#ifdef ACCELERATE_PROFILE + Tracy.emit_memory_alloc_named _ptr (fromIntegral bytes) 0 _name +#endif + void $ Atomic.add __total_bytes_allocated_remote (fromIntegral bytes) + +remote_memory_free _name _ptr = do +#ifdef ACCELERATE_PROFILE + Tracy.emit_memory_free_named _ptr 0 _name +#endif + return () + +remote_memory_evict name ptr bytes = do + void $ Atomic.add __num_evictions 1 + void $ Atomic.add __total_bytes_evicted_from_remote (fromIntegral bytes) + remote_memory_free name ptr + memcpy_from_remote bytes +#endif + +remote_memory_alloc_nursery :: Ptr a -> Int -> IO () +remote_memory_free_nursery :: Ptr a -> IO () +#ifndef ACCELERATE_PROFILE +remote_memory_alloc_nursery _ _ = return () +remote_memory_free_nursery _ = return () +#else +remote_memory_alloc_nursery p n = Tracy.emit_memory_alloc_named p (fromIntegral n) 0 ___nursery +remote_memory_free_nursery p = Tracy.emit_memory_free_named p 0 ___nursery +#endif + + +-- Data transfer between memory spaces +-- +{-# INLINE memcpy_to_remote #-} +{-# INLINE memcpy_from_remote #-} +memcpy_to_remote :: Int -> IO () +memcpy_from_remote :: Int -> IO () +#ifndef ACCELERATE_DEBUG +memcpy_to_remote _ = return () +memcpy_from_remote _ = return () +#else +memcpy_to_remote n = void $ Atomic.add __total_bytes_copied_to_remote (fromIntegral n) +memcpy_from_remote n = void $ Atomic.add __total_bytes_copied_from_remote (fromIntegral n) +#endif + + +-- Performed a major GC of the remote memory space +-- +{-# INLINE emit_remote_gc #-} +emit_remote_gc :: IO () +#ifndef ACCELERATE_DEBUG +emit_remote_gc = return () +#else +emit_remote_gc = void $ Atomic.add __num_remote_gcs 1 +#endif + + +-- Monitoring variables +-- -------------------- + +foreign import ccall "&__total_bytes_allocated_local" __total_bytes_allocated_local :: Atomic -- bytes allocated in the local (CPU) memory space +foreign import ccall "&__total_bytes_allocated_remote" __total_bytes_allocated_remote :: Atomic -- bytes allocated in the remote memory space (if it is separate, e.g. GPU) +foreign import ccall "&__total_bytes_copied_to_remote" __total_bytes_copied_to_remote :: Atomic -- bytes copied to the remote memory space +foreign import ccall "&__total_bytes_copied_from_remote" __total_bytes_copied_from_remote :: Atomic -- bytes copied from the remote memory space +foreign import ccall "&__total_bytes_evicted_from_remote" __total_bytes_evicted_from_remote :: Atomic -- total bytes copied from the remote due to evictions +foreign import ccall "&__num_remote_gcs" __num_remote_gcs :: Atomic -- number of times the remote memory space was forcibly garbage collected +foreign import ccall "&__num_evictions" __num_evictions :: Atomic -- number of LRU eviction events + +-- SEE: [linking to .c files] +-- +runQ $ do + addForeignFilePath LangC "cbits/monitoring.c" + return [] + diff --git a/src/Data/Array/Accelerate/Debug/Internal/Timed.hs b/src/Data/Array/Accelerate/Debug/Internal/Timed.hs index 1bcc4f80a..8113e3993 100644 --- a/src/Data/Array/Accelerate/Debug/Internal/Timed.hs +++ b/src/Data/Array/Accelerate/Debug/Internal/Timed.hs @@ -22,14 +22,14 @@ import Data.Array.Accelerate.Debug.Internal.Flags import Data.Array.Accelerate.Debug.Internal.Trace import Control.Monad.Trans ( MonadIO ) +import Data.Text.Lazy.Builder +import Data.Text.Format #if ACCELERATE_DEBUG import Data.Array.Accelerate.Debug.Internal.Clock import Control.Applicative import Control.Monad.Trans ( liftIO ) -import Data.Text.Lazy.Builder -import Data.Text.Format import System.CPUTime import Prelude diff --git a/src/Data/Array/Accelerate/Debug/Internal/Trace.hs b/src/Data/Array/Accelerate/Debug/Internal/Trace.hs index a94e1e6d7..07087ee1d 100644 --- a/src/Data/Array/Accelerate/Debug/Internal/Trace.hs +++ b/src/Data/Array/Accelerate/Debug/Internal/Trace.hs @@ -26,11 +26,12 @@ module Data.Array.Accelerate.Debug.Internal.Trace ( import Data.Array.Accelerate.Debug.Internal.Flags -#ifdef ACCELERATE_DEBUG -import Data.Array.Accelerate.Debug.Internal.Clock +import Data.Text.Format import Data.Text.Lazy.Builder import Data.Text.Lazy.Builder.RealFloat -import Data.Text.Format + +#ifdef ACCELERATE_DEBUG +import Data.Array.Accelerate.Debug.Internal.Clock import System.IO import System.IO.Unsafe #endif diff --git a/src/Data/Array/Accelerate/Debug/Internal/Tracy.hs b/src/Data/Array/Accelerate/Debug/Internal/Tracy.hs index 9e019d689..3f7b43f7a 100644 --- a/src/Data/Array/Accelerate/Debug/Internal/Tracy.hs +++ b/src/Data/Array/Accelerate/Debug/Internal/Tracy.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE CPP #-} {-# LANGUAGE ForeignFunctionInterface #-} {-# LANGUAGE TemplateHaskell #-} {-# OPTIONS_GHC -fobject-code #-} @@ -18,42 +19,46 @@ import Data.Word import Foreign.C.String import Foreign.C.Types import Foreign.Ptr + +#ifdef ACCELERATE_PROFILE import Language.Haskell.TH.Syntax +#endif +type Zone = Word64 +type SrcLoc = Word64 -type TracyCZoneCtx = Word64 -type TracyCSrcLoc = Word64 +#ifdef ACCELERATE_PROFILE -foreign import ccall unsafe "___tracy_init_thread" ___tracy_init_thread :: IO () -foreign import ccall unsafe "___tracy_set_thread_name" ___tracy_set_thread_name :: CString -> IO () +foreign import ccall unsafe "___tracy_init_thread" init_thread :: IO () +foreign import ccall unsafe "___tracy_set_thread_name" set_thread_name :: CString -> IO () -foreign import ccall unsafe "___tracy_alloc_srcloc" ___tracy_alloc_srcloc :: Word32 -> CString -> CSize -> CString -> CSize -> IO TracyCSrcLoc -foreign import ccall unsafe "___tracy_alloc_srcloc_name" ___tracy_alloc_srcloc_name :: Word32 -> CString -> CSize -> CString -> CSize -> CString -> CSize -> IO TracyCSrcLoc +foreign import ccall unsafe "___tracy_alloc_srcloc" alloc_srcloc :: Word32 -> CString -> CSize -> CString -> CSize -> IO SrcLoc +foreign import ccall unsafe "___tracy_alloc_srcloc_name" alloc_srcloc_name :: Word32 -> CString -> CSize -> CString -> CSize -> CString -> CSize -> IO SrcLoc -foreign import ccall unsafe "___tracy_emit_zone_begin_alloc" ___tracy_emit_zone_begin_alloc :: TracyCSrcLoc -> CInt -> IO TracyCZoneCtx -foreign import ccall unsafe "___tracy_emit_zone_end" ___tracy_emit_zone_end :: TracyCZoneCtx -> IO () -foreign import ccall unsafe "___tracy_emit_zone_text" ___tracy_emit_zone_text :: TracyCZoneCtx -> CString -> CSize -> IO () -foreign import ccall unsafe "___tracy_emit_zone_name" ___tracy_emit_zone_name :: TracyCZoneCtx -> CString -> CSize -> IO () -foreign import ccall unsafe "___tracy_emit_zone_color" ___tracy_emit_zone_color :: TracyCZoneCtx -> Word32 -> IO () -foreign import ccall unsafe "___tracy_emit_zone_value" ___tracy_emit_zone_value :: TracyCZoneCtx -> Word64 -> IO () +foreign import ccall unsafe "___tracy_emit_zone_begin_alloc" emit_zone_begin :: SrcLoc -> CInt -> IO Zone +foreign import ccall unsafe "___tracy_emit_zone_end" emit_zone_end :: Zone -> IO () +foreign import ccall unsafe "___tracy_emit_zone_text" emit_zone_text :: Zone -> CString -> CSize -> IO () +foreign import ccall unsafe "___tracy_emit_zone_name" emit_zone_name :: Zone -> CString -> CSize -> IO () +foreign import ccall unsafe "___tracy_emit_zone_color" emit_zone_color :: Zone -> Word32 -> IO () +foreign import ccall unsafe "___tracy_emit_zone_value" emit_zone_value :: Zone -> Word64 -> IO () -foreign import ccall unsafe "___tracy_emit_memory_alloc" ___tracy_emit_memory_alloc :: Ptr () -> CSize -> CInt -> IO () -foreign import ccall unsafe "___tracy_emit_memory_free" ___tracy_emit_memory_free :: Ptr () -> CInt -> IO () -foreign import ccall unsafe "___tracy_emit_memory_alloc_named" ___tracy_emit_memory_alloc_named :: Ptr () -> CSize -> CInt -> CString -> IO () -foreign import ccall unsafe "___tracy_emit_memory_free_named" ___tracy_emit_memory_free_named :: Ptr () -> CInt -> CString -> IO () +foreign import ccall unsafe "___tracy_emit_memory_alloc" emit_memory_alloc :: Ptr a -> CSize -> CInt -> IO () +foreign import ccall unsafe "___tracy_emit_memory_free" emit_memory_free :: Ptr a -> CInt -> IO () +foreign import ccall unsafe "___tracy_emit_memory_alloc_named" emit_memory_alloc_named :: Ptr a -> CSize -> CInt -> CString -> IO () +foreign import ccall unsafe "___tracy_emit_memory_free_named" emit_memory_free_named :: Ptr a -> CInt -> CString -> IO () -foreign import ccall unsafe "___tracy_emit_message" ___tracy_emit_message :: CString -> CSize -> CInt -> IO () -foreign import ccall unsafe "___tracy_emit_messageC" ___tracy_emit_message_colour :: CString -> CSize -> Word32 -> CInt -> IO () -foreign import ccall unsafe "___tracy_emit_messageL" ___tracy_emit_message_literal :: CString -> CInt -> IO () -foreign import ccall unsafe "___tracy_emit_messageLC" ___tracy_emit_message_literal_colour :: CString -> Word32 -> CInt -> IO () +foreign import ccall unsafe "___tracy_emit_message" emit_message :: CString -> CSize -> CInt -> IO () +foreign import ccall unsafe "___tracy_emit_messageC" emit_message_colour :: CString -> CSize -> Word32 -> CInt -> IO () +foreign import ccall unsafe "___tracy_emit_messageL" emit_message_literal :: CString -> CInt -> IO () +foreign import ccall unsafe "___tracy_emit_messageLC" emit_message_literal_colour :: CString -> Word32 -> CInt -> IO () -foreign import ccall unsafe "___tracy_emit_frame_mark" ___tracy_emit_frame_mark :: CString -> IO () -foreign import ccall unsafe "___tracy_emit_frame_mark_start" ___tracy_emit_frame_mark_start :: CString -> IO () -foreign import ccall unsafe "___tracy_emit_frame_mark_end" ___tracy_emit_frame_mark_end :: CString -> IO () -foreign import ccall unsafe "___tracy_emit_frame_image" ___tracy_emit_frame_image :: Ptr () -> Word16 -> Word16 -> Word8 -> CInt -> IO () -- width height offset flip +foreign import ccall unsafe "___tracy_emit_frame_mark" emit_frame_mark :: CString -> IO () +foreign import ccall unsafe "___tracy_emit_frame_mark_start" emit_frame_mark_start :: CString -> IO () +foreign import ccall unsafe "___tracy_emit_frame_mark_end" emit_frame_mark_end :: CString -> IO () +foreign import ccall unsafe "___tracy_emit_frame_image" emit_frame_image :: Ptr Word32 -> Word16 -> Word16 -> Word8 -> CInt -> IO () -- width height offset flip; dimensions must be divisible by 4! -foreign import ccall unsafe "___tracy_emit_plot" ___tracy_emit_plot :: CString -> Double -> IO () -foreign import ccall unsafe "___tracy_emit_message_appinfo" ___tracy_emit_message_appinfo :: CString -> CSize -> IO () +foreign import ccall unsafe "___tracy_emit_plot" emit_plot :: CString -> Double -> IO () +foreign import ccall unsafe "___tracy_emit_message_appinfo" emit_message_appinfo :: CString -> CSize -> IO () -- SEE: [linking to .c files] -- @@ -61,3 +66,79 @@ runQ $ do addForeignFilePath LangCxx "cbits/tracy/TracyClient.cpp" return [] +#else + +init_thread :: IO () +init_thread = return () + +set_thread_name :: CString -> IO () +set_thread_name _ = return () + +alloc_srcloc :: Word32 -> CString -> CSize -> CString -> CSize -> IO SrcLoc +alloc_srcloc _ _ _ _ _ = return 0 + +alloc_srcloc_name :: Word32 -> CString -> CSize -> CString -> CSize -> CString -> CSize -> IO SrcLoc +alloc_srcloc_name _ _ _ _ _ _ _ = return 0 + +emit_zone_begin :: SrcLoc -> CInt -> IO Zone +emit_zone_begin _ _ = return 0 + +emit_zone_end :: Zone -> IO () +emit_zone_end _ = return () + +emit_zone_text :: Zone -> CString -> CSize -> IO () +emit_zone_text _ _ _ = return () + +emit_zone_name :: Zone -> CString -> CSize -> IO () +emit_zone_name _ _ _ = return () + +emit_zone_color :: Zone -> Word32 -> IO () +emit_zone_color _ _ = return () + +emit_zone_value :: Zone -> Word64 -> IO () +emit_zone_value _ _ = return () + +emit_memory_alloc :: Ptr a -> CSize -> CInt -> IO () +emit_memory_alloc _ _ _ = return () + +emit_memory_free :: Ptr a -> CInt -> IO () +emit_memory_free _ _ = return () + +emit_memory_alloc_named :: Ptr a -> CSize -> CInt -> CString -> IO () +emit_memory_alloc_named _ _ _ _ = return () + +emit_memory_free_named :: Ptr a -> CInt -> CString -> IO () +emit_memory_free_named _ _ _ = return () + +emit_message :: CString -> CSize -> CInt -> IO () +emit_message _ _ _ = return () + +emit_message_colour :: CString -> CSize -> Word32 -> CInt -> IO () +emit_message_colour _ _ _ _ = return () + +emit_message_literal :: CString -> CInt -> IO () +emit_message_literal _ _ = return () + +emit_message_literal_colour :: CString -> Word32 -> CInt -> IO () +emit_message_literal_colour _ _ _ = return () + +emit_frame_mark :: CString -> IO () +emit_frame_mark _ = return () + +emit_frame_mark_start :: CString -> IO () +emit_frame_mark_start _ = return () + +emit_frame_mark_end :: CString -> IO () +emit_frame_mark_end _ = return () + +emit_frame_image :: Ptr Word32 -> Word16 -> Word16 -> Word8 -> CInt -> IO () +emit_frame_image _ _ _ _ _ = return () + +emit_plot :: CString -> Double -> IO () +emit_plot _ _ = return () + +emit_message_appinfo :: CString -> CSize -> IO () +emit_message_appinfo _ _ = return () + +#endif +