made parallel
some performance-data in main.hs. Changes can be viewed in git. TODO: - Find out why there is overhed of >50%
This commit is contained in:
		
							
								
								
									
										240
									
								
								src/Main.hs
									
									
									
									
									
								
							
							
						
						
									
										240
									
								
								src/Main.hs
									
									
									
									
									
								
							| @@ -42,7 +42,7 @@ import qualified Data.Stream                    as S | ||||
| import qualified Data.Text                      as T | ||||
| import           Data.Text.Encoding | ||||
| import qualified Data.Vector.Unboxed            as V | ||||
| import           Debug.Trace | ||||
| --import           Debug.Trace | ||||
| import           System.Environment | ||||
| import           System.Exit                    (exitFailure, exitSuccess) | ||||
| import           Test.QuickCheck.All            (quickCheckAll) | ||||
| @@ -120,20 +120,21 @@ emptyLine a | ||||
| doCalculation adj attr = | ||||
|         let | ||||
|                 dens = 0.75 | ||||
|                 omega = (A.fromListUnboxed (ix1 3) [0.5,0.5,0.5]) | ||||
|                 omega = (A.fromListUnboxed (ix1 6) [0,5,3,300,5,10]) | ||||
|                 delta = 2 | ||||
|                 (adj_, graph_) = preprocess adj attr {--0.8--} omega delta | ||||
|         in | ||||
|                 B.concat $ | ||||
|                         [ | ||||
|                                 outputArray $ trace ("After: "++ show (sumAllS adj_)++"\n") adj_, | ||||
|                                 --outputArray $ --trace ("After: "++ show (sumAllS adj_)++"\n")  | ||||
|                                 --              adj_, | ||||
|                                 outputGraph $ L.sort $ doAll graph_ adj_ attr dens omega delta | ||||
| --                                outputGraph $ L.sort $ (step graph_ adj attr dens omega delta) | ||||
| --                                                       ++ (step (step graph_ adj attr dens omega delta) adj attr dens omega delta) | ||||
|                         ] | ||||
|                 where | ||||
|                         -- don't print out seeds | ||||
|                         doAll [] _ _ _ _ _ = [] | ||||
|                         doAll gs a b c d e = doAll' (step gs a b c d e) a b c d e | ||||
|                         -- but everything in the following recursive calls | ||||
|                         doAll' [] _ _ _ _ _ = [] | ||||
|                         doAll' gs a b c d e = gs ++ doAll' (step gs a b c d e) a b c d e | ||||
|  | ||||
| @@ -172,7 +173,7 @@ checkError a | ||||
|  | ||||
| -- | convinience debug-function. Needs to be | ||||
| --   changed to return () to disable Debug. | ||||
| debug a = putStrLn a | ||||
| debug a = return () --putStrLn a | ||||
|  | ||||
|  | ||||
| -- | The main-function to bootstrap our application | ||||
| @@ -226,5 +227,230 @@ main = do | ||||
|  | ||||
|     ----- CALCULATE & OUTPUT | ||||
|  | ||||
|     debug $ "Before: " ++ show (sumAllS graph) | ||||
|     --debug $ "Before: " ++ show (sumAllS graph) | ||||
|     B.putStr $ doCalculation graph attr | ||||
|  | ||||
|      | ||||
|      | ||||
|      | ||||
| {---TIMINGS | ||||
|  | ||||
| SINGLE CORE | ||||
| =========== | ||||
|  ./hgraph +RTS -s >result | ||||
|  197,751,229,488 bytes allocated in the heap | ||||
|      290,034,880 bytes copied during GC | ||||
|       11,061,600 bytes maximum residency (10 sample(s)) | ||||
|        1,513,488 bytes maximum slop | ||||
|               33 MB total memory in use (0 MB lost due to fragmentation) | ||||
|  | ||||
|                                     Tot time (elapsed)  Avg pause  Max pause | ||||
|   Gen  0     381500 colls,     0 par    2.24s    2.20s     0.0000s    0.0005s | ||||
|   Gen  1        10 colls,     0 par    0.05s    0.05s     0.0054s    0.0154s | ||||
|  | ||||
|   TASKS: 3 (1 bound, 2 peak workers (2 total), using -N1) | ||||
|  | ||||
|   SPARKS: 6266 (0 converted, 0 overflowed, 0 dud, 15 GC'd, 6251 fizzled) | ||||
|  | ||||
|   INIT    time    0.00s  (  0.00s elapsed) | ||||
|   MUT     time   74.11s  ( 74.11s elapsed) | ||||
|   GC      time    2.30s  (  2.25s elapsed) | ||||
|   EXIT    time    0.00s  (  0.00s elapsed) | ||||
|   Total   time   76.42s  ( 76.36s elapsed) | ||||
|  | ||||
|   Alloc rate    2,668,183,845 bytes per MUT second | ||||
|  | ||||
|   Productivity  97.0% of total user, 97.1% of total elapsed | ||||
|  | ||||
| gc_alloc_block_sync: 0 | ||||
| whitehole_spin: 0 | ||||
| gen[0].sync: 0 | ||||
| gen[1].sync: 0 | ||||
|  | ||||
| 4 CORES: | ||||
| ======== | ||||
| ./hgraph +RTS -s -N4 >result | ||||
|  197,754,645,560 bytes allocated in the heap | ||||
|      293,083,624 bytes copied during GC | ||||
|       11,061,264 bytes maximum residency (10 sample(s)) | ||||
|        1,555,576 bytes maximum slop | ||||
|               34 MB total memory in use (0 MB lost due to fragmentation) | ||||
|  | ||||
|                                     Tot time (elapsed)  Avg pause  Max pause | ||||
|   Gen  0     380952 colls, 380952 par   15.25s    3.92s     0.0000s    0.0255s | ||||
|   Gen  1        10 colls,     9 par    0.22s    0.06s     0.0056s    0.0181s | ||||
|  | ||||
|   Parallel GC work balance: 1.68% (serial 0%, perfect 100%) | ||||
|  | ||||
|   TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4) | ||||
|  | ||||
|   SPARKS: 6266 (6228 converted, 0 overflowed, 0 dud, 30 GC'd, 8 fizzled) | ||||
|  | ||||
|   INIT    time    0.00s  (  0.00s elapsed) | ||||
|   MUT     time  105.25s  ( 86.11s elapsed) | ||||
|   GC      time   15.47s  (  3.98s elapsed) | ||||
|   EXIT    time    0.00s  (  0.00s elapsed) | ||||
|   Total   time  120.73s  ( 90.09s elapsed) | ||||
|  | ||||
|   Alloc rate    1,878,861,647 bytes per MUT second | ||||
|  | ||||
|   Productivity  87.2% of total user, 116.8% of total elapsed | ||||
|  | ||||
| gc_alloc_block_sync: 661438 | ||||
| whitehole_spin: 0 | ||||
| gen[0].sync: 655 | ||||
| gen[1].sync: 1347 | ||||
|  | ||||
|  | ||||
| parallel preprocessing (Adj, Seeds) | ||||
| ================================== | ||||
| ./hgraph +RTS -s -N4 >result | ||||
| Building hgraph-0.0.1... | ||||
| Preprocessing executable 'hgraph' for hgraph-0.0.1... | ||||
| [4 of 5] Compiling DCB.DCB          ( src/DCB/DCB.hs, dist/build/hgraph/hgraph-tmp/DCB/DCB.o ) | ||||
| Linking dist/build/hgraph/hgraph ... | ||||
|  197,755,802,848 bytes allocated in the heap | ||||
|      289,986,840 bytes copied during GC | ||||
|       11,071,880 bytes maximum residency (10 sample(s)) | ||||
|        1,566,376 bytes maximum slop | ||||
|               34 MB total memory in use (0 MB lost due to fragmentation) | ||||
|  | ||||
|                                     Tot time (elapsed)  Avg pause  Max pause | ||||
|   Gen  0     380919 colls, 380919 par   15.73s    3.93s     0.0000s    0.0112s | ||||
|   Gen  1        10 colls,     9 par    0.28s    0.07s     0.0073s    0.0335s | ||||
|  | ||||
|   Parallel GC work balance: 1.69% (serial 0%, perfect 100%) | ||||
|  | ||||
|   TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4) | ||||
|  | ||||
|   SPARKS: 7895 (7825 converted, 0 overflowed, 0 dud, 50 GC'd, 20 fizzled) | ||||
|  | ||||
|   INIT    time    0.00s  (  0.00s elapsed) | ||||
|   MUT     time   98.47s  ( 81.37s elapsed) | ||||
|   GC      time   16.01s  (  4.00s elapsed) | ||||
|   EXIT    time    0.00s  (  0.00s elapsed) | ||||
|   Total   time  114.49s  ( 85.37s elapsed) | ||||
|  | ||||
|   Alloc rate    2,008,240,220 bytes per MUT second | ||||
|  | ||||
|   Productivity  86.0% of total user, 115.3% of total elapsed | ||||
|  | ||||
| gc_alloc_block_sync: 757575 | ||||
| whitehole_spin: 0 | ||||
| gen[0].sync: 592 | ||||
| gen[1].sync: 510 | ||||
|  | ||||
| parallel processing (primitive, too many sparks fizzled) - Speedup: 76.36/51.51 = 1.48   | ||||
| ======================================================== | ||||
| ./hgraph +RTS -s -N4 >result | ||||
| Building hgraph-0.0.1... | ||||
| Preprocessing executable 'hgraph' for hgraph-0.0.1... | ||||
| [4 of 5] Compiling DCB.DCB          ( src/DCB/DCB.hs, dist/build/hgraph/hgraph-tmp/DCB/DCB.o ) | ||||
| [5 of 5] Compiling Main             ( src/Main.hs, dist/build/hgraph/hgraph-tmp/Main.o ) | ||||
| Linking dist/build/hgraph/hgraph ... | ||||
|  205,324,862,344 bytes allocated in the heap | ||||
|      224,224,264 bytes copied during GC | ||||
|       11,157,008 bytes maximum residency (9 sample(s)) | ||||
|        1,559,568 bytes maximum slop | ||||
|               35 MB total memory in use (0 MB lost due to fragmentation) | ||||
|  | ||||
|                                     Tot time (elapsed)  Avg pause  Max pause | ||||
|   Gen  0     123063 colls, 123063 par    6.77s    1.67s     0.0000s    0.0074s | ||||
|   Gen  1         9 colls,     8 par    0.21s    0.06s     0.0061s    0.0190s | ||||
|  | ||||
|   Parallel GC work balance: 8.15% (serial 0%, perfect 100%) | ||||
|  | ||||
|   TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4) | ||||
|  | ||||
|   SPARKS: 1714681 (861196 converted, 0 overflowed, 0 dud, 78 GC'd, 853407 fizzled) | ||||
|  | ||||
|   INIT    time    0.00s  (  0.00s elapsed) | ||||
|   MUT     time  145.46s  ( 49.78s elapsed) | ||||
|   GC      time    6.99s  (  1.73s elapsed) | ||||
|   EXIT    time    0.00s  (  0.00s elapsed) | ||||
|   Total   time  152.45s  ( 51.51s elapsed) | ||||
|  | ||||
|   Alloc rate    1,411,565,587 bytes per MUT second | ||||
|  | ||||
|   Productivity  95.4% of total user, 282.4% of total elapsed | ||||
|  | ||||
| gc_alloc_block_sync: 378641 | ||||
| whitehole_spin: 0 | ||||
| gen[0].sync: 572 | ||||
| gen[1].sync: 609 | ||||
|  | ||||
|  | ||||
| parallel processing (monad-par, repa-stuff seqential) - Speedup: 76.36/34,05 = 2.243 | ||||
| ===================================================== | ||||
|  | ||||
| ./hgraph +RTS -N4 -s > result.txt | ||||
|  204,368,634,080 bytes allocated in the heap | ||||
|      306,058,720 bytes copied during GC | ||||
|       11,108,872 bytes maximum residency (10 sample(s)) | ||||
|        1,597,088 bytes maximum slop | ||||
|               35 MB total memory in use (0 MB lost due to fragmentation) | ||||
|  | ||||
|                                     Tot time (elapsed)  Avg pause  Max pause | ||||
|   Gen  0     108838 colls, 108838 par    9.21s    2.29s     0.0000s    0.0020s | ||||
|   Gen  1        10 colls,     9 par    0.32s    0.08s     0.0083s    0.0245s | ||||
|  | ||||
|   Parallel GC work balance: 29.41% (serial 0%, perfect 100%) | ||||
|  | ||||
|   TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4) | ||||
|  | ||||
|   SPARKS: 15737 (14412 converted, 0 overflowed, 0 dud, 1251 GC'd, 74 fizzled) | ||||
|  | ||||
|   INIT    time    0.00s  (  0.00s elapsed) | ||||
|   MUT     time  124.37s  ( 31.67s elapsed) | ||||
|   GC      time    9.53s  (  2.37s elapsed) | ||||
|   EXIT    time    0.00s  (  0.00s elapsed) | ||||
|   Total   time  133.91s  ( 34.05s elapsed) | ||||
|  | ||||
|   Alloc rate    1,643,242,747 bytes per MUT second | ||||
|  | ||||
|   Productivity  92.9% of total user, 365.3% of total elapsed | ||||
|  | ||||
| gc_alloc_block_sync: 531144 | ||||
| whitehole_spin: 0 | ||||
| gen[0].sync: 758 | ||||
| gen[1].sync: 17 | ||||
|          | ||||
|  | ||||
| ADDITIONAL OVERHEAD (running on 1 Core with parallel stuff): Slowdown: 76.36/123.39 = 0.62885 | ||||
| ============================================================ | ||||
|  | ||||
| ./hgraph +RTS -N1 -s > result.txt | ||||
|  204,364,490,096 bytes allocated in the heap | ||||
|      291,824,120 bytes copied during GC | ||||
|       10,081,664 bytes maximum residency (11 sample(s)) | ||||
|        1,545,536 bytes maximum slop | ||||
|               30 MB total memory in use (0 MB lost due to fragmentation) | ||||
|  | ||||
|                                     Tot time (elapsed)  Avg pause  Max pause | ||||
|   Gen  0     393965 colls,     0 par    6.77s    6.72s     0.0000s    0.0017s | ||||
|   Gen  1        11 colls,     0 par    0.09s    0.09s     0.0079s    0.0217s | ||||
|  | ||||
|   TASKS: 3 (1 bound, 2 peak workers (2 total), using -N1) | ||||
|  | ||||
|   SPARKS: 15737 (0 converted, 0 overflowed, 0 dud, 166 GC'd, 15571 fizzled) | ||||
|  | ||||
|   INIT    time    0.00s  (  0.00s elapsed) | ||||
|   MUT     time  116.53s  (116.52s elapsed) | ||||
|   GC      time    6.85s  (  6.80s elapsed) | ||||
|   EXIT    time    0.00s  (  0.00s elapsed) | ||||
|   Total   time  123.39s  (123.32s elapsed) | ||||
|  | ||||
|   Alloc rate    1,753,707,727 bytes per MUT second | ||||
|  | ||||
|   Productivity  94.4% of total user, 94.5% of total elapsed | ||||
|  | ||||
| gc_alloc_block_sync: 0 | ||||
| whitehole_spin: 0 | ||||
| gen[0].sync: 0 | ||||
| gen[1].sync: 0 | ||||
|  | ||||
|  | ||||
| REAL SPEEDUP AGAINST OVERHEAD-VARIANT: 129.39/34.05 = 3.8 | ||||
| ========================================================= | ||||
|  | ||||
| -} | ||||
		Reference in New Issue
	
	Block a user