lectures/lecture4/Parsers.hs

{-# LANGUAGE GADTs #-}
module Parsers
  ( P        
  , symbol   
  , pfail    
  , (+++)    
  , Semantics
  , parse       
  ) where


-- | Naive deep embedding: each operation is implemented as a
-- constructor.
type P s a = Parser1 s a
symbol :: P s s
symbol = Symbol

pfail :: P s a
pfail  = Fail

(+++) :: P s a -> P s a -> P s a
(+++)  = (:+++)

instance Monad (Parser1 s) where
  return = Return
  (>>=)  = (:>>=)

data Parser1 s a where
  Symbol :: Parser1 s s
  Fail   :: Parser1 s a
  (:+++) :: Parser1 s a -> Parser1 s a -> Parser1 s a
  Return :: a -> Parser1 s a
  (:>>=) :: Parser1 s a -> (a -> Parser1 s b) -> Parser1 s b

-- Final semantics to expose:
type Semantics s a = [s] -> [(a,[s])]

-- | Reference implementation/Semantics.  (It's easy to see that it's what we
-- want, but inefficient)
run :: Parser1 s a -> Semantics s a
run Symbol     (c : s) = [(c, s)]
run Symbol     []      = []
run Fail       _       = []
run (p :+++ q) s       = run p s ++ run q s
run (Return x) s       = [(x, s)]
run (p :>>= f) s       = [(y, s'') | (x, s')  <- run p s
                                   , (y, s'') <- run (f x) s']

{-
  Using this reference semantics we can prove (exercise) a number of useful laws about
  parsers. We will use these laws later to derive an efficient implementation
  of the library.

  Notation: [| p |] = run p

  For two parsers p and q we define
  
    p == q  iff  ∀ s. [| p |] s == [| q |] s, up to the order of elements in the result 
                                              (list is interpreted as a multiset).

  Monad Laws

    L1.  return x >>= f   ==  f x
    L2.  p >>= return     ==  p
    L3.  (p >>= f) >>= g  ==  p >>= (\x -> f x >>= g)

  More laws about >>=, (+++) and fail

    L4.  fail >>= f       ==  fail
    L5.  (p +++ q) >>= f  ==  (p >>= f) +++ (q >>= f)

  Laws about (+++) and fail

    L6.  fail +++ q       ==  q
    L7.  p +++ fail       ==  p

  Laws about (+++)

    L8.  (p +++ q) +++ r  ==  p +++ (q +++ r)
    L9.  p +++ q          ==  q +++ p           -- multisets are important here!

  Laws about >>=, (+++) and symbol
  
    L10. (symbol >>= f) +++ (symbol >>= g)
                          ==  symbol >>= (\c -> f c +++ g c)

  Here is the proof of L10 for the case of a non-empty input string:

       [| (symbol >>= f) +++ (symbol >>= g) |] (c:s)         

 ==  { semantics of (+++) }

       [| symbol >>= f |] (c:s) ++ [| symbol >>= g |] (c:s)  

 ==  { semantics of >>= and symbol }

       [| f c |] s ++ [| g c |] s                   

  ==  { semantics of (+++) }

       [| f c +++ g c |] s                                  

  ==  { semantics of symbol and >>= }

      [| symbol >>= (\x -> f x +++ g x) |] (c:s)

  Exercise: prove or test the laws
-}

{- The reference semantics is useful for reasoning, but inefficient.
   There are three sources of inefficiency that we can identify:

   1. The list comprehension builds a lot of intermediate lists which might be
      costly.

   2. List append (++) is linear in its first argument which means that left
      nested applications of (+++) get a quadratic behaviour.

   3. (+++) is treated in a depth first way, first computing the results of the
      left parser, then computing the results of the second parser. This leads
      to a space leak since we have to hang on to the input string to feed to
      the second parser, while traversing the string with the first parser.
-}

-- To solve them we'll invent clever intermediate representations.



-- Can we linearize sequencing (>>=)? (Would help with 1.)
data Parser2 s a where
    SymbolBind2 :: (s -> Parser2 s a) -> (Parser2 s a) -- SymbolBind f  ≜  Symbol >>= f
    Return2 :: a -> Parser2 s a
    (::+++) :: Parser2 s a -> Parser2 s a -> Parser2 s a
    Fail2 :: Parser2 s a


runParser2 :: Parser2 s a -> Semantics s a
runParser2 (SymbolBind2 y) [] = []
runParser2 (SymbolBind2 y) (x : xs) = runParser2 (y x) xs -- ~= run (Symbol >>= f) (x:xs)
runParser2 (Return2 y) l = [ (y , l) ] 
runParser2 (y ::+++ y') l = runParser2 y l ++ runParser2 y' l
runParser2 Fail2 l = []


-- But it turns out that we can translate Parser1 into Parser2!
p12 :: Parser1 s a -> Parser2 s a
p12 Symbol = SymbolBind2 Return2 -- L1
p12 Fail = Fail2
p12 (y :+++ q) = p12 y ::+++ p12 q
p12 (Return y) = Return2 y 
p12 (Symbol :>>= q) = SymbolBind2 (\c -> p12 (q c)) -- def of SymbolBind
p12 (Fail :>>= q) = Fail2 -- Parser law. L4.
p12 ((y :+++ q) :>>= y0) = p12 (y :>>= y0) ::+++ p12 (q :>>= y0) -- Parser law. L5
p12 (Return y :>>= q) = p12 (q y) -- monad law, L1
p12 ((p :>>= k') :>>= k) = p12 (p :>>= (\x -> k' x :>>= k)) -- monad law, L3

-- Can we linearize choice as well (+++)?
data Parser3 s a where
    SymbolBind3 :: (s -> Parser3 s a) -> Parser3 s a
    ReturnChoice3 :: a -> Parser3 s a -> Parser3 s a 
    -- ReturnChoice x p  ≜  Return x +++ p
    Fail3 :: Parser3 s a

runParser3 :: Parser3 s a -> Semantics s a
runParser3 (SymbolBind3 y) [] = []
runParser3 (SymbolBind3 y) (x : xs) = runParser3 (y x) xs
runParser3 (ReturnChoice3 y q) l = (y , l) : runParser3 q l 
                             -- ~= run (Return x +++ p)
runParser3 Fail3 l = []

-- But it turns out that we can translate 2 into 3!
p23 :: Parser2 s a -> Parser3 s a
p23 (SymbolBind2 y) = SymbolBind3 (\s -> p23 (y s))
p23 (Return2 y) = ReturnChoice3 y Fail3 -- def. of returnchoice
p23 (p ::+++ q) = best (p23 p) (p23 q)
p23 Fail2 = Fail3

best :: Parser3 s a -> Parser3 s a -> Parser3 s a
best (SymbolBind3 y) (SymbolBind3 q) = SymbolBind3 (\s -> best (y s) (q s)) -- L10
best p (ReturnChoice3 x q) = ReturnChoice3 x (best p q) -- L8 (+++ commut)
best (ReturnChoice3 x q) p = ReturnChoice3 x (best p q) -- L9 (+++ assoc)
best p Fail3 = p -- L6
best Fail3 q = q -- L7


-- | Efficient implementation for general syntax:
parse :: P s a -> Semantics s a
parse = runParser3 . p23 . p12

-- we could show formally: 
-- (x , s) ∈ run        p ss  <=>  (x , s) ∈ runParser2 (1to2 p) ss
-- (x , s) ∈ runParser2 p ss  <=>  (x , s) ∈ runParser3 (2to3 p) ss

-- and therefore:
-- (x , s) ∈ run p ss <-> (x , s) ∈ parse p ss

-- Exercise: prove or test


{----------------------
 NOTES:

* L4 to L10 are "parser laws", expected to hold of any well-behaved
parser.


-}