DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

Snippets has posted 5883 posts at DZone. View Full User Profile

Haskell Regular Expression Matcher

08.19.2007
| 4759 views |
  • submit to reddit
        Basic implementation of Regular Expressions based on "Derivatives of Regular Expressions" by Janusz A. Brzozowski (Journal of Association for Computing Machinery, October 1964)

Not really intended for serious use. Just a proof of concept.

module Regexp
where

import Data.Set (Set)
import Data.Map (Map)
import Monad
import List
import Maybe
import qualified Data.Set as Set
import qualified Data.Map as Map

data Regexp = 
    Zero
  | Match Char       -- matches a single char
  | Not Regexp       -- matches the negation of its argument
  | Prod [Regexp]    -- matches a concatentation of its arguments
  | Sum (Set Regexp) -- matches either of its arguments
  | Star Regexp      -- matches repetitions of its argument (including 0 repetitions)
  deriving (Eq, Ord)

instance Show Regexp where
  show Zero = "0"
  show (Match c) = [c]
  show (Not x)   = '~' : show x
  show (Prod x) = join . (map show)  $ x
  show (Sum x) = "(" ++ ( join . intersperse "|" . (map show) . Set.toList $ x ) ++ ")"
  show (Star x) = "(" ++ show x ++ ")*" 

-- Flagrant abuse of type classes to allow implicit conversion of datatypes into regular
-- expressions.
class Match a where
  match :: a -> Regexp

instance Match Char where
  match c = Match c

instance (Match a) => Match [a] where
  match = con

instance Match Regexp where
  match = id

-- "smart" versions of the constructors, which perform normalisation of the datatype.
-- As long as all regular expressions are built up using these and the match instance
-- for char we can guarantee that structural equality of terms == similarity.
-- This is important to make sure we only generate a finite number of states.
zero :: Regexp 
zero = Zero 

one :: Regexp
one = Prod []

(<+>) :: (Match a, Match b) => a -> b -> Regexp
x <+> y = 
  case (match x, match y) of
    (Zero, b)      -> b
    (a, Zero)      -> a
    (Sum a, Sum b) -> Sum (Set.union a b)
    (Sum a, b)     -> Sum (Set.insert b a)
    (a, Sum b)     -> Sum (Set.insert a b)    
    (a, b)         -> Sum $ Set.fromList [a, b]
  
oneOf :: (Match a) => [a] -> Regexp
oneOf = foldr (<+>) zero

(<*>) :: (Match a, Match b) => a -> b -> Regexp
u <*> v = 
  case (match u, match v) of 
    (Zero, _)         -> zero
    (_, Zero)         -> zero
    (Prod x, Prod y)  -> Prod (x ++ y)
    (Prod x, y)       -> Prod (x ++ [y])
    (x, Prod y)       -> Prod (x : y)
    (x, y)            -> Prod [x, y]

con :: (Match a) => [a] -> Regexp
con = foldr (<*>) one

neg :: (Match a) => a -> Regexp
neg x = 
  case (match x) of
  (Not y) -> y
  y       -> Not y

star :: (Match a) => a -> Regexp
star x =
  case (match x) of
    (Zero)   -> Zero
    (Star y) -> Star y
    y        -> Star y

-- Returns if the regex matches the empty string.
del :: Regexp -> Bool
del (Zero)    = False
del (Sum x)   = or . map del $ Set.toList x
del (Prod x)  = and . map del $ x
del (Match _) = False
del (Not x)   = not $ del x;
del (Star _)  = True

-- The derivative of a regular language A with respect to a character
-- c is dA/dc = { s : cs \in A } 
diff :: Char -> Regexp -> Regexp
diff _ (Zero)  = zero
diff c (Match d) | (c == d) = one
diff c (Match d) = zero
diff c (Sum x) = oneOf $ (map $ diff c) (Set.toList x)
diff c (Prod []) = zero
diff c (Prod (x:xs)) | del x = (diff c x <*> xs) <+> diff c (Prod xs)
diff c (Prod (x:xs)) = diff c x <*> xs
diff c (Not x) = Not (diff c x)
diff c (Star x) = diff c x <*> Star x

flattenSet :: (Ord a) => Set (Set a) -> Set a
flattenSet = Set.fold Set.union Set.empty

(/>>=) :: (Ord a, Ord b) => Set a -> (a -> Set b) -> Set b
x />>= f = flattenSet (Set.map f x)

-- The alphabet of all characters that appear in this regexp
alphabet :: Regexp -> Set Char
alphabet (Zero) = Set.empty
alphabet (Sum x) = flattenSet (Set.map alphabet x) 
alphabet (Prod x) = Set.unions $ map alphabet x
alphabet (Not x) = alphabet x
alphabet (Star x) = alphabet x
alphabet (Match c) = Set.singleton c

-- Set of all derivatives of a regular expression (including itself, and higher order derivatives).
derivatives :: Regexp -> [Regexp]
derivatives exp = Set.toList $ enlarge (Set.singleton exp) (Set.singleton exp) 
  where
    alpha = alphabet exp
    firstDerivatives x = Set.map (`diff` x) alpha 
    enlarge :: Set Regexp -> Set Regexp -> Set Regexp
    enlarge new found = 
      if Set.null new
        then found
        else
          let nextNew   = (new />>= firstDerivatives) Set.\\ found
              nextFound = found `Set.union` nextNew
          in enlarge nextNew nextFound

-- A simple finite state machine type 
data FSM = State { transitions :: (Map Char FSM), isFinal :: Bool } 

-- Converts a Regexp into a finite state machine by using the derivatives
-- with respect to specific characters as the transitions. Essentially at 
-- each stage we build up a regular expression that the remaining characters
-- have to match. Due to Cunning Mathematics, only finitely many such regular
-- expressions (up to similarity) result.
compile :: Regexp -> FSM
compile x = fromJust $ Map.lookup x states
  where
    states :: Map Regexp FSM
    states = Map.fromList $
      do re <- derivatives x -- Totally gratuitious use of list monad. :)
         let trans = do c <- Set.toList $ alphabet re
                        let d = diff c re
                        return (c, fromJust $ Map.lookup d states)
         let state = State (Map.fromList trans) (del re) 
         return (re, state) 

runFSM :: FSM -> String -> Bool
runFSM x []     = isFinal x
runFSM x (c:cs) = case (Map.lookup c $ transitions x) of
                    Nothing -> False
                    Just y  -> runFSM y cs
               
matches :: (Match a) => String -> a -> Bool
matches cs exp = runFSM (compile $  match exp) cs