DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

# Haskell Regular Expression Matcher

Basic implementation of Regular Expressions based on "Derivatives of Regular Expressions" by Janusz A. Brzozowski (Journal of Association for Computing Machinery, October 1964) Not really intended for serious use. Just a proof of concept.

module Regexp where import Data.Set (Set) import Data.Map (Map) import Monad import List import Maybe import qualified Data.Set as Set import qualified Data.Map as Map data Regexp = Zero | Match Char -- matches a single char | Not Regexp -- matches the negation of its argument | Prod [Regexp] -- matches a concatentation of its arguments | Sum (Set Regexp) -- matches either of its arguments | Star Regexp -- matches repetitions of its argument (including 0 repetitions) deriving (Eq, Ord) instance Show Regexp where show Zero = "0" show (Match c) = [c] show (Not x) = '~' : show x show (Prod x) = join . (map show) $ x show (Sum x) = "(" ++ ( join . intersperse "|" . (map show) . Set.toList $ x ) ++ ")" show (Star x) = "(" ++ show x ++ ")*" -- Flagrant abuse of type classes to allow implicit conversion of datatypes into regular -- expressions. class Match a where match :: a -> Regexp instance Match Char where match c = Match c instance (Match a) => Match [a] where match = con instance Match Regexp where match = id -- "smart" versions of the constructors, which perform normalisation of the datatype. -- As long as all regular expressions are built up using these and the match instance -- for char we can guarantee that structural equality of terms == similarity. -- This is important to make sure we only generate a finite number of states. zero :: Regexp zero = Zero one :: Regexp one = Prod [] (<+>) :: (Match a, Match b) => a -> b -> Regexp x <+> y = case (match x, match y) of (Zero, b) -> b (a, Zero) -> a (Sum a, Sum b) -> Sum (Set.union a b) (Sum a, b) -> Sum (Set.insert b a) (a, Sum b) -> Sum (Set.insert a b) (a, b) -> Sum $ Set.fromList [a, b] oneOf :: (Match a) => [a] -> Regexp oneOf = foldr (<+>) zero (<*>) :: (Match a, Match b) => a -> b -> Regexp u <*> v = case (match u, match v) of (Zero, _) -> zero (_, Zero) -> zero (Prod x, Prod y) -> Prod (x ++ y) (Prod x, y) -> Prod (x ++ [y]) (x, Prod y) -> Prod (x : y) (x, y) -> Prod [x, y] con :: (Match a) => [a] -> Regexp con = foldr (<*>) one neg :: (Match a) => a -> Regexp neg x = case (match x) of (Not y) -> y y -> Not y star :: (Match a) => a -> Regexp star x = case (match x) of (Zero) -> Zero (Star y) -> Star y y -> Star y -- Returns if the regex matches the empty string. del :: Regexp -> Bool del (Zero) = False del (Sum x) = or . map del $ Set.toList x del (Prod x) = and . map del $ x del (Match _) = False del (Not x) = not $ del x; del (Star _) = True -- The derivative of a regular language A with respect to a character -- c is dA/dc = { s : cs \in A } diff :: Char -> Regexp -> Regexp diff _ (Zero) = zero diff c (Match d) | (c == d) = one diff c (Match d) = zero diff c (Sum x) = oneOf $ (map $ diff c) (Set.toList x) diff c (Prod []) = zero diff c (Prod (x:xs)) | del x = (diff c x <*> xs) <+> diff c (Prod xs) diff c (Prod (x:xs)) = diff c x <*> xs diff c (Not x) = Not (diff c x) diff c (Star x) = diff c x <*> Star x flattenSet :: (Ord a) => Set (Set a) -> Set a flattenSet = Set.fold Set.union Set.empty (/>>=) :: (Ord a, Ord b) => Set a -> (a -> Set b) -> Set b x />>= f = flattenSet (Set.map f x) -- The alphabet of all characters that appear in this regexp alphabet :: Regexp -> Set Char alphabet (Zero) = Set.empty alphabet (Sum x) = flattenSet (Set.map alphabet x) alphabet (Prod x) = Set.unions $ map alphabet x alphabet (Not x) = alphabet x alphabet (Star x) = alphabet x alphabet (Match c) = Set.singleton c -- Set of all derivatives of a regular expression (including itself, and higher order derivatives). derivatives :: Regexp -> [Regexp] derivatives exp = Set.toList $ enlarge (Set.singleton exp) (Set.singleton exp) where alpha = alphabet exp firstDerivatives x = Set.map (`diff` x) alpha enlarge :: Set Regexp -> Set Regexp -> Set Regexp enlarge new found = if Set.null new then found else let nextNew = (new />>= firstDerivatives) Set.\\ found nextFound = found `Set.union` nextNew in enlarge nextNew nextFound -- A simple finite state machine type data FSM = State { transitions :: (Map Char FSM), isFinal :: Bool } -- Converts a Regexp into a finite state machine by using the derivatives -- with respect to specific characters as the transitions. Essentially at -- each stage we build up a regular expression that the remaining characters -- have to match. Due to Cunning Mathematics, only finitely many such regular -- expressions (up to similarity) result. compile :: Regexp -> FSM compile x = fromJust $ Map.lookup x states where states :: Map Regexp FSM states = Map.fromList $ do re <- derivatives x -- Totally gratuitious use of list monad. :) let trans = do c <- Set.toList $ alphabet re let d = diff c re return (c, fromJust $ Map.lookup d states) let state = State (Map.fromList trans) (del re) return (re, state) runFSM :: FSM -> String -> Bool runFSM x [] = isFinal x runFSM x (c:cs) = case (Map.lookup c $ transitions x) of Nothing -> False Just y -> runFSM y cs matches :: (Match a) => String -> a -> Bool matches cs exp = runFSM (compile $ match exp) cs