module ObsLab where

import Obsidian
import Obsidian.CodeGen.CUDA
import Obsidian.Run.CUDA.Exec

import Prelude hiding (zipWith)
import qualified Prelude as P

import Control.Monad.State

import qualified Data.Vector.Storable as V
import Data.Word

----------------------------------------------------------------------
-- TASK 3 (Vector Addition)
----------------------------------------------------------------------

------------------------------------------------------------
-- vadd block-local computation
--  - Look at the "inc" example from the lecture slides
------------------------------------------------------------
vaddLocal :: SPull EFloat -> SPull EFloat -> SPull EFloat
vaddLocal = undefined

------------------------------------------------------------
-- Distribute and replicate the vadd computation over blocks
--  - Split the array into parts of 128 elements each
--  - Look at the "inc" example from the lecture slides
------------------------------------------------------------
vadd :: DPull EFloat -> DPull EFloat -> DPush Grid EFloat
vadd a b = asGrid $ undefined
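-- One possible shape for the two definitions above, following the
-- splitUp/push pattern from the lecture's "inc" example. Here zipWith
-- is Obsidian's (Prelude's is hidden above); splitUp and push are
-- assumed from the lecture material and their exact names may differ
-- between Obsidian versions, so this is only a commented sketch, not
-- a definitive solution:
--
--   vaddLocal = zipWith (+)
--
--   vadd a b = asGrid $ zipWith (\a' b' -> push (vaddLocal a' b'))
--                               (splitUp 128 a) (splitUp 128 b)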
------------------------------------------------------------
-- Launch the vadd computation on the GPU
------------------------------------------------------------
launchVadd :: IO ()
launchVadd =
  withCUDA $
  do
    -- capture: compiles the Obsidian code all the
    -- way down to GPU executable format.
    --  - The first argument is the number of "real"
    --    CUDA threads to generate code for. Here 64
    --    threads perform the 128 additions that each
    --    block of vadd is responsible for.
    --    Experiment with this number.
    kern <- capture 64 vadd

    -- Generate input data and allocate arrays in GPU DRAM.
    -- This is just an example; for timing, larger arrays
    -- and more blocks should be used.
    useVector (V.fromList [0..255]) $ \a ->
      useVector (V.fromList (P.reverse [0..255])) $ \b ->
        withVector 256 $ \o ->
        do
          fill o 0

          -- Launch 2 blocks computing "kern" with
          -- arrays a and b as input.
          -- Put the output in o.
          o <== (2,kern) <> a <> b

          r <- copyOut o
          lift $ putStrLn $ show r

----------------------------------------------------------------------
-- TASK 4 (Reduction)
----------------------------------------------------------------------

------------------------------------------------------------
-- Reduction (sum or generalized)
------------------------------------------------------------
sumLocal :: SPull EFloat -> Program Block (SPush Block EFloat)
sumLocal arr
  | len arr == 1 = undefined
  | otherwise    = undefined

-- Alternative formulation
sumLocal' :: SPull EFloat -> SPush Block EFloat
sumLocal' arr = execBlock $ body arr
  where
    body arr
      | len arr == 1 = undefined
      | otherwise    = undefined

------------------------------------------------------------
-- Perform many parallel reductions, one per block
------------------------------------------------------------
sums :: DPull EFloat -> DPush Grid EFloat
sums arr = asGrid $ undefined

-- Alternative formulation
sums' :: DPull EFloat -> DPush Grid EFloat
sums' arr = asGrid $ undefined

------------------------------------------------------------
-- Launch the sums computation on the GPU
------------------------------------------------------------
launchSums :: IO ()
launchSums =
  withCUDA $
  do
    kern <- capture 64 sums

    -- Generate input data and allocate arrays in GPU DRAM
    useVector (V.fromList [0..255]) $ \a ->
      withVector 2 $ \o ->
      do
        fill o 0

        -- Launch 2 blocks computing "kern" with
        -- array a as input.
        -- Put the output in o.
        o <== (2,kern) <> a

        r <- copyOut o
        lift $ putStrLn $ show r

----------------------------------------------------------------------
-- TASK 5 (Dot Product)
----------------------------------------------------------------------
-- This task combines a reduction with an elementwise operation.
-- The reduction needed is sum; the elementwise operation needed
-- is the vector of pairwise products.

prodLocal :: SPull EFloat -> SPull EFloat -> SPull EFloat
prodLocal = undefined

products :: DPull EFloat -> DPull EFloat -> DPush Grid EFloat
products a b = asGrid $ undefined

------------------------------------------------------------
-- Perform many dot products in parallel, one per block
------------------------------------------------------------

-- Alternative (1): create a dotProds kernel that combines
-- prodLocal and sumLocal.
dotProds :: DPull EFloat -> DPull EFloat -> DPush Grid EFloat
dotProds a1 a2 = asGrid $ undefined
  where
    body :: SPull EFloat -> SPull EFloat -> SPush Block EFloat
    body a b = undefined
    -- (hint)
    -- body a b = execBlock $ do ...

-- Alternative (2): run the "products" and "sums" kernels
-- one after another as separate kernel launches.

------------------------------------------------------------
-- Launch Alternative 2
------------------------------------------------------------
launchAlt2 :: IO ()
launchAlt2 =
  withCUDA $
  do
    prod_k <- capture 64 products
    sums_k <- capture 64 sums

    -- Generate input data and allocate arrays in GPU DRAM
    useVector (V.fromList [0..255]) $ \a ->
      useVector (V.fromList (P.reverse [0..255])) $ \b ->
        withVector 256 $ \tmp ->
          withVector 2 $ \o ->
          do
            fill o 0
            tmp <== (2,prod_k) <> a <> b
            o   <== (2,sums_k) <> tmp

            r <- copyOut o
            lift $ putStrLn $ show r

------------------------------------------------------------
-- Launch Alternative 1
------------------------------------------------------------
launchAlt1 :: IO ()
launchAlt1 =
  withCUDA $
  do
    dotp_k <- capture 64 dotProds

    -- Generate input data and allocate arrays in GPU DRAM
    useVector (V.fromList [0..255]) $ \a ->
      useVector (V.fromList (P.reverse [0..255])) $ \b ->
        withVector 2 $ \o ->
        do
          fill o 0
          o <== (2,dotp_k) <> a <> b

          r <- copyOut o
          lift $ putStrLn $ show r

----------------------------------------------------------------------
-- TASK 6
----------------------------------------------------------------------
-- This task is very open.
--  * Build on one of the previous tasks by parameterizing
--    and generalizing it.
--  * Or implement anything else; there are ideas in the lab
--    description.

----------------------------------------------------------------------
-- Main (example)
----------------------------------------------------------------------
main :: IO ()
main = launchVadd
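----------------------------------------------------------------------
-- Sketches (not solutions)
----------------------------------------------------------------------
-- A rough shape for the block-local reduction in Task 4. It assumes
-- a halve combinator and some compute/force primitive that
-- materializes the intermediate pull array; the exact name of that
-- primitive depends on the Obsidian version used in the lecture, so
-- the sketch is left commented out:
--
--   sumLocal arr
--     | len arr == 1 = return (push arr)
--     | otherwise    =
--         do let (a1,a2) = halve arr
--            arr' <- compute (zipWith (+) a1 a2)  -- "compute" is an assumed name
--            sumLocal arr'
--
--   sums arr = asGrid $ fmap sumLocal' (splitUp 128 arr)
--
-- For Task 5, alternative (1), the per-block body can then simply be
-- the elementwise product followed by the same reduction:
--
--   body a b = execBlock (sumLocal (prodLocal a b))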