lectures/lecture4/Parser0.hs

-- | The naive shallow embedding following the semantics, using lists
--   to implement the multisets.

module Parser0
  ( P           -- :: * -> * -> *
  , symbol      -- :: P s s
  , pfail       -- :: P s a
  , (+++)       -- :: P s a -> P s a -> P s a
  , ParseResult -- ParseResult s a = [(a, [s])]
  , parse       -- :: P s a -> [s] -> ParseResult s a
  ) where

{-

  The semantics of a parser of type 'P s a' is a function from a
  string of 's' to a multiset of results paired with the remaining
  parts of the input string. We use a multiset capture the fact that
  we don't care about the order of the results.

  The semantic function [| _ |] is defined as follows (we use {| |} do denote
  multisets and \/ for multiset union).

  [| _ |] :: P s a -> [s] -> {| (a, [s]) |}
  [| symbol   |] (c : s) = {| (c, s) |}
  [| symbol   |] []      = {| |}
  [| pfail    |] s       = {| |}
  [| p +++ q  |] s       = [| p |] s  \/  [| q |] s
  [| return x |] s       = {| (x, s) |}
  [| p >>= f  |] s       = {| (y, s'') | (x, s')  <- [| p   |] s
                                       , (y, s'') <- [| f x |] s
                           |}

  Using this semantics we can prove (exercise) a number of useful laws about
  parsers. We will use these laws later to derive an efficient implementation
  of the library.

  For two parsers p and q we define
  
    p == q  iff  forall s. [| p |] s == [| q |] s

  Monad Laws

    L1.  return x >>= f   ==  f x
    L2.  p >>= return     ==  p
    L3.  (p >>= f) >>= g  ==  p >>= (\x -> f x >>= g)

  More laws about >>=

    L4.  fail >>= f       ==  fail
    L5.  (p +++ q) >>= f  ==  (p >>= f) +++ (q >>= f)

  Laws about (+++) and fail

    L6.  fail +++ q       ==  q
    L7.  p +++ fail       ==  p

  Laws about (+++)

    L8.  (p +++ q) +++ r  ==  p +++ (q +++ r)
    L9.  p +++ q          ==  q +++ p           -- multisets are important here!

  Laws about symbol

    L10. (symbol >>= f) +++ (symbol >>= g)
                          ==  symbol >>= (\c -> f c +++ g c)

  Here is the proof of L10 for the case of a non-empty input string:

    [| (symbol >>= f) +++ (symbol >>= g) |] (c:s)         ==  { semantics of (+++) }
    [| symbol >>= f |] (c:s) \/ [| symbol >>= g |] (c:s)  ==  { semantics of >>= and symbol }
    [| f c |] s \/ [| g c |] s                            ==  { semantics of (+++) }
    [| f c +++ g c |] s                                   ==  { semantics of symbol and >>= }
    [| symbol >>= (\c -> f c +++ g c) |] (c:s)

  We can make a shallow embedding following the semantics, using lists for the
  multisets.

-}

type ParseResult s a = [(a, [s])]

-- | We implement parsers by their semantics.
newtype P s a = P ([s] -> ParseResult s a)

-- | The 'parse' function is trivial.
parse :: P s a -> [s] -> ParseResult s a
parse (P p) s = p s

-- The operations simply follow the semantics inserting the newtype constructor
-- where appropriate.

symbol :: P s s
symbol = P $ \s -> case s of
  (c : s) -> [(c, s)]
  []      -> []

pfail :: P s a
pfail = P $ \_ -> []

(+++) :: P s a -> P s a -> P s a
P p +++ P q = P $ \s -> p s ++ q s

instance Monad (P s) where
  return x = P $ \s -> [(x, s)]
  p >>= f  = P $ \s -> [ (y, s'') | (x, s')  <- parse p s
                                  , (y, s'') <- parse (f x) s'
                       ]
  fail _ = pfail


-- There are a number of efficiency problems with this implementation. To solve
-- them we'll start with a naive deep embedding and successively refine it to
-- get something efficient.