Skip to content

Commit

Permalink
add profile hooks
Browse files Browse the repository at this point in the history
  • Loading branch information
tmcdonell committed Apr 16, 2021
1 parent 0b27a17 commit b8a3f90
Show file tree
Hide file tree
Showing 12 changed files with 338 additions and 125 deletions.
59 changes: 9 additions & 50 deletions accelerate.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -234,47 +234,10 @@ flag debug
* @dump-sched@: Print information related to execution scheduling.
.

flag ekg
manual: True
default: False
description:
Enable hooks for monitoring the running application using EKG. Implies
@debug@ mode. In order to view the metrics, your application will need to
call @Data.Array.Accelerate.Debug.beginMonitoring@ before running any
Accelerate computations. This will launch the server on the local machine at
port 8000.
.
Alternatively, if you wish to configure the EKG monitoring server you can
initialise it like so:
.
> import Data.Array.Accelerate.Debug
>
> import System.Metrics
> import System.Remote.Monitoring
>
> main :: IO ()
> main = do
> store <- initAccMetrics
> registerGcMetrics store -- optional
>
> server <- forkServerWith store "localhost" 8000
>
> ...
.
Note that, as with any program utilising EKG, in order to collect Haskell GC
statistics, you must either run the program with:
.
> +RTS -T -RTS
.
or compile it with:
.
> -with-rtsopts=-T
.

flag prof
manual: True
default: False
description: Enable hooks for profiling the running application using Tracy
description: Enable hooks for profiling the running application

flag bounds-checks
manual: True
Expand Down Expand Up @@ -324,6 +287,7 @@ library
, template-haskell
, terminal-size >= 0.3
, text >= 1.2
, text-format >= 0.3
, transformers >= 0.3
, unique
, unordered-containers >= 0.2
Expand Down Expand Up @@ -417,7 +381,7 @@ library
Data.Array.Accelerate.Debug.Internal.Clock
Data.Array.Accelerate.Debug.Internal.Flags
Data.Array.Accelerate.Debug.Internal.Graph
Data.Array.Accelerate.Debug.Internal.Monitoring
Data.Array.Accelerate.Debug.Internal.Profile
Data.Array.Accelerate.Debug.Internal.Stats
Data.Array.Accelerate.Debug.Internal.Timed
Data.Array.Accelerate.Debug.Internal.Trace
Expand Down Expand Up @@ -521,33 +485,28 @@ library
hs-source-dirs:
src

if flag(debug) || flag(ekg)
if flag(debug) || flag(prof)
cc-options:
-DACCELERATE_DEBUG

cpp-options:
-DACCELERATE_DEBUG

if flag(ekg)
cpp-options:
-DACCELERATE_MONITORING

build-depends:
async >= 2.0
, ekg >= 0.1
, ekg-core >= 0.1
, text >= 1.0

if flag(prof)
cc-options:
-DACCELERATE_PROFILE

cxx-options:
-O3
-Wall
-march=native
-std=c++11
-DACCELERATE_PROFILE
-DTRACY_ENABLE
-DTRACY_NO_SAMPLING

cpp-options:
-DACCELERATE_PROFILE
-DTRACY_ENABLE
-DTRACY_NO_SAMPLING

Expand Down
6 changes: 0 additions & 6 deletions cbits/monitoring.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@
/* These monitoring counters are globals which will be accessed from the
* Haskell side.
*/
int64_t DOUBLE_CACHE_ALIGNED __active_ns_llvm_native = 0;
int64_t DOUBLE_CACHE_ALIGNED __active_ns_llvm_ptx = 0;

int64_t DOUBLE_CACHE_ALIGNED __current_bytes_remote = 0;
int64_t DOUBLE_CACHE_ALIGNED __current_bytes_nursery = 0;

int64_t DOUBLE_CACHE_ALIGNED __total_bytes_allocated_local = 0;
int64_t DOUBLE_CACHE_ALIGNED __total_bytes_allocated_remote = 0;
int64_t DOUBLE_CACHE_ALIGNED __total_bytes_copied_to_remote = 0;
Expand Down
File renamed without changes.
39 changes: 23 additions & 16 deletions src/Data/Array/Accelerate/Array/Data.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE OverloadedStrings #-}
Expand Down Expand Up @@ -51,13 +52,16 @@ module Data.Array.Accelerate.Array.Data (

-- friends
import Data.Array.Accelerate.Array.Unique
import Data.Array.Accelerate.Representation.Type
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Representation.Type
import Data.Array.Accelerate.Type
import Data.Primitive.Vec
#ifdef ACCELERATE_PROFILE
import Data.Array.Accelerate.Lifetime
#endif

import Data.Array.Accelerate.Debug.Internal.Flags
import Data.Array.Accelerate.Debug.Internal.Monitoring
import Data.Array.Accelerate.Debug.Internal.Profile
import Data.Array.Accelerate.Debug.Internal.Trace


Expand Down Expand Up @@ -281,15 +285,18 @@ runArrayData st = unsafePerformIO $ do
-- intermediate arrays that contain meaningful data only on the device.
--
allocateArray :: forall e. (HasCallStack, Storable e) => Int -> IO (UniqueArray e)
allocateArray !size
= internalCheck "size must be >= 0" (size >= 0)
$ newUniqueArray <=< unsafeInterleaveIO $ do
let bytes = size * sizeOf (undefined :: e)
new <- readIORef __mallocForeignPtrBytes
ptr <- new bytes
traceIO dump_gc $ printf "gc: allocated new host array (size=%d, ptr=%s)" bytes (show ptr)
didAllocateBytesLocal (fromIntegral bytes)
return (castForeignPtr ptr)
allocateArray !size = internalCheck "size must be >= 0" (size >= 0) $ do
arr <- newUniqueArray <=< unsafeInterleaveIO $ do
let bytes = size * sizeOf (undefined :: e)
new <- readIORef __mallocForeignPtrBytes
ptr <- new bytes
traceIO dump_gc $ build "gc: allocated new host array (size={}, ptr={})" (bytes, unsafeForeignPtrToPtr ptr)
local_memory_alloc (unsafeForeignPtrToPtr ptr) bytes
return (castForeignPtr ptr)
#ifdef ACCELERATE_PROFILE
addFinalizer (uniqueArrayData arr) (local_memory_free (unsafeUniqueArrayPtr arr))
#endif
return arr

-- | Register the given function as the callback to use to allocate new array
-- data on the host containing the specified number of bytes. The returned array
Expand All @@ -307,18 +314,18 @@ registerForeignPtrAllocator new = do
__mallocForeignPtrBytes :: IORef (Int -> IO (ForeignPtr Word8))
__mallocForeignPtrBytes = unsafePerformIO $! newIORef mallocPlainForeignPtrBytesAligned

-- | Allocate the given number of bytes with 16-byte alignment. This is
-- essential for SIMD instructions.
-- | Allocate the given number of bytes with 64-byte (cache line)
-- alignment. This is essential for SIMD instructions.
--
-- Additionally, we return a plain ForeignPtr, which unlike a regular ForeignPtr
-- created with 'mallocForeignPtr' carries no finalisers. It is an error to try
-- to add a finaliser to the plain ForeignPtr. For our purposes this is fine,
-- since in Accelerate finalisers are handled using Lifetime
--
mallocPlainForeignPtrBytesAligned :: Int -> IO (ForeignPtr a)
mallocPlainForeignPtrBytesAligned (I# size) = IO $ \s ->
case newAlignedPinnedByteArray# size 64# s of
(# s', mbarr# #) -> (# s', ForeignPtr (byteArrayContents# (unsafeCoerce# mbarr#)) (PlainPtr mbarr#) #)
mallocPlainForeignPtrBytesAligned (I# size#) = IO $ \s0 ->
case newAlignedPinnedByteArray# size# 64# s0 of
(# s1, mbarr# #) -> (# s1, ForeignPtr (byteArrayContents# (unsafeCoerce# mbarr#)) (PlainPtr mbarr#) #)


liftArrayData :: Int -> TypeR e -> ArrayData e -> Q (TExp (ArrayData e))
Expand Down
12 changes: 6 additions & 6 deletions src/Data/Array/Accelerate/Array/Remote/LRU.hs
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ import Data.Array.Accelerate.Array.Remote.Class
import Data.Array.Accelerate.Array.Remote.Table ( StableArray, makeWeakArrayData )
import Data.Array.Accelerate.Array.Unique ( touchUniqueArray )
import Data.Array.Accelerate.Error ( internalError )
import Data.Array.Accelerate.Representation.Elt
import Data.Array.Accelerate.Representation.Type
-- import Data.Array.Accelerate.Representation.Elt
-- import Data.Array.Accelerate.Representation.Type
import Data.Array.Accelerate.Type
import qualified Data.Array.Accelerate.Array.Remote.Table as Basic
import qualified Data.Array.Accelerate.Debug.Internal.Flags as Debug
import qualified Data.Array.Accelerate.Debug.Internal.Monitoring as Debug
-- import qualified Data.Array.Accelerate.Debug.Internal.Profile as Debug
import qualified Data.Array.Accelerate.Debug.Internal.Trace as Debug

import Control.Concurrent.MVar ( MVar, newMVar, withMVar, takeMVar, putMVar, mkWeakMVar )
Expand Down Expand Up @@ -301,7 +301,7 @@ evictLRU !utbl !mt = trace "evictLRU/evicting-eldest-array" $ do
Just arr -> do
message (build "evictLRU/evicting {}" (show sa))
copyIfNecessary status n tp arr
liftIO $ Debug.didEvictBytes (remoteBytes tp n)
-- liftIO $ Debug.remote_memory_evict sa (remoteBytes tp n)
liftIO $ Basic.freeStable @m mt sa
liftIO $ HT.insert utbl sa (Used ts Evicted count tasks n tp weak_arr)
return True
Expand All @@ -322,8 +322,8 @@ evictLRU !utbl !mt = trace "evictLRU/evicting-eldest-array" $ do
_ -> return prev
eldest prev _ = return prev

remoteBytes :: SingleType e -> Int -> Int64
remoteBytes tp n = fromIntegral (bytesElt (TupRsingle (SingleScalarType tp))) * fromIntegral n
-- remoteBytes :: SingleType e -> Int -> Int
-- remoteBytes tp n = bytesElt (TupRsingle (SingleScalarType tp)) * n

evictable :: Status -> Bool
evictable Clean = True
Expand Down
9 changes: 4 additions & 5 deletions src/Data/Array/Accelerate/Array/Remote/Nursery.hs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ module Data.Array.Accelerate.Array.Remote.Nursery (
-- friends
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Debug.Internal.Flags as Debug
import Data.Array.Accelerate.Debug.Internal.Monitoring as Debug
-- import Data.Array.Accelerate.Debug.Internal.Profile as Debug
import Data.Array.Accelerate.Debug.Internal.Trace as Debug

-- libraries
Expand Down Expand Up @@ -75,7 +75,7 @@ lookup !key (Nursery !ref !_) =
Just r ->
case Seq.viewl r of
v Seq.:< vs -> do
Debug.decreaseCurrentBytesNursery (fromIntegral key)
-- Debug.remote_memory_free_nursery v
if Seq.null vs
then return (Nothing, Just v) -- delete this entry from the map
else return (Just vs, Just v) -- re-insert the tail
Expand All @@ -89,7 +89,7 @@ lookup !key (Nursery !ref !_) =
insert :: Int -> ptr Word8 -> Nursery ptr -> IO ()
insert !key !val (Nursery !ref _) =
withMVar ref $ \nrs -> do
Debug.increaseCurrentBytesRemote (fromIntegral key)
-- Debug.remote_memory_alloc_nursery val key
HT.mutate nrs key $ \case
Nothing -> (Just (Seq.singleton val), ())
Just vs -> (Just (vs Seq.|> val), ())
Expand All @@ -103,8 +103,7 @@ cleanup delete !ref = do
message "nursery cleanup"
modifyMVar_ ref $ \nrs -> do
HT.mapM_ (Seq.mapM delete . snd) nrs
Debug.setCurrentBytesNursery 0
nrs' <- HT.new
nrs' <- HT.new
return nrs'


Expand Down
8 changes: 4 additions & 4 deletions src/Data/Array/Accelerate/Array/Remote/Table.hs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ import Data.Array.Accelerate.Lifetime
import Data.Array.Accelerate.Type
import qualified Data.Array.Accelerate.Array.Remote.Nursery as N
import qualified Data.Array.Accelerate.Debug.Internal.Flags as Debug
import qualified Data.Array.Accelerate.Debug.Internal.Monitoring as Debug
import qualified Data.Array.Accelerate.Debug.Internal.Profile as Debug
import qualified Data.Array.Accelerate.Debug.Internal.Trace as Debug

import GHC.Stack
Expand Down Expand Up @@ -253,7 +253,7 @@ freeStable (MemoryTable !ref _ !nrs _) !sa =
Just (RemoteArray !p !bytes _) -> do
message (build "free/nursery: {} of {}" (show sa, showBytes bytes))
N.insert bytes (castRemotePtr @m p) nrs
Debug.decreaseCurrentBytesRemote (fromIntegral bytes)
-- Debug.remote_memory_free (unsafeRemotePtrToPtr @m p)

return (Nothing, ())

Expand All @@ -274,7 +274,7 @@ insert mt@(MemoryTable !ref _ _ _) !tp !arr !ptr !bytes | SingleArrayDict <- sin
key <- makeStableArray tp arr
weak <- liftIO $ makeWeakArrayData tp arr () (Just $ freeStable @m mt key)
message $ build "insert: {}" (show key)
liftIO $ Debug.increaseCurrentBytesRemote (fromIntegral bytes)
-- liftIO $ Debug.remote_memory_alloc (unsafeRemotePtrToPtr @m ptr) bytes
liftIO $ withMVar ref $ \tbl -> HT.insert tbl key (RemoteArray (castRemotePtr @m ptr) bytes weak)


Expand Down Expand Up @@ -312,7 +312,7 @@ clean mt@(MemoryTable _ weak_ref nrs _) = management "clean" nrs . liftIO $ do
-- that finalizers are often significantly delayed, it is worth our while
-- traversing the table and explicitly freeing any dead entires.
--
Debug.didRemoteGC
Debug.emit_remote_gc
performGC
yield
mr <- deRefWeak weak_ref
Expand Down
14 changes: 7 additions & 7 deletions src/Data/Array/Accelerate/Debug/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
module Data.Array.Accelerate.Debug.Internal (

debuggingIsEnabled,
monitoringIsEnabled,
profilingIsEnabled,
boundsChecksAreEnabled,
unsafeChecksAreEnabled,
internalChecksAreEnabled,
Expand All @@ -28,7 +28,7 @@ module Data.Array.Accelerate.Debug.Internal (

import Data.Array.Accelerate.Debug.Internal.Flags as Debug
import Data.Array.Accelerate.Debug.Internal.Graph as Debug
import Data.Array.Accelerate.Debug.Internal.Monitoring as Debug
import Data.Array.Accelerate.Debug.Internal.Profile as Debug
import Data.Array.Accelerate.Debug.Internal.Stats as Debug
import Data.Array.Accelerate.Debug.Internal.Timed as Debug
import Data.Array.Accelerate.Debug.Internal.Trace as Debug
Expand All @@ -43,12 +43,12 @@ debuggingIsEnabled = True
debuggingIsEnabled = False
#endif

{-# INLINE monitoringIsEnabled #-}
monitoringIsEnabled :: Bool
#ifdef ACCELERATE_MONITORING
monitoringIsEnabled = True
{-# INLINE profilingIsEnabled #-}
profilingIsEnabled :: Bool
#ifdef ACCELERATE_PROFILE
profilingIsEnabled = True
#else
monitoringIsEnabled = False
profilingIsEnabled = False
#endif

{-# INLINE boundsChecksAreEnabled #-}
Expand Down
Loading

0 comments on commit b8a3f90

Please sign in to comment.