-- | This "Text.Regex.TDFA.Pattern" module provides the 'Pattern' data
-- type and its subtypes. This 'Pattern' type is used to represent
-- the parsed form of a Regular Expression.
module Text.Regex.TDFA.Pattern
(Pattern(..)
,PatternSet(..)
,PatternSetCharacterClass(..)
,PatternSetCollatingElement(..)
,PatternSetEquivalenceClass(..)
,GroupIndex
,DoPa(..)
,showPattern
-- ** Internal use
,starTrans
-- ** Internal use, Operations to support debugging under ghci
,starTrans',simplify',dfsPattern
) where
{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}
import Data.List(intersperse,partition)
import qualified Data.Set as Set(toAscList,toList)
import Data.Set(Set) -- XXX EnumSet
import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error)
err :: String -> a
err = common_error "Text.Regex.TDFA.Pattern"
-- | Pattern is the type returned by the regular expression parser.
-- This is consumed by the CorePattern module and the tender leaves
-- are nibbled by the TNFA module.
data Pattern = PEmpty
| PGroup (Maybe GroupIndex) Pattern -- Nothing to indicate non-matching PGroup (Nothing never used!)
| POr [Pattern] -- flattened by starTrans
| PConcat [Pattern] -- flattened by starTrans
| PQuest Pattern -- eliminated by starTrans
| PPlus Pattern -- eliminated by starTrans
| PStar Bool Pattern -- True means mayFirstBeNull is True
| PBound Int (Maybe Int) Pattern -- eliminated by starTrans
-- The rest of these need an index of where in the regex string it is from
| PCarat {getDoPa::DoPa}
| PDollar {getDoPa::DoPa}
-- The following test and accept a single character
| PDot {getDoPa::DoPa} -- Any character (newline?) at all
| PAny {getDoPa::DoPa,getPatternSet::PatternSet} -- Square bracketed things
| PAnyNot {getDoPa::DoPa,getPatternSet::PatternSet} -- Inverted square bracketed things
| PEscape {getDoPa::DoPa,getPatternChar::Char} -- Backslashed Character
| PChar {getDoPa::DoPa,getPatternChar::Char} -- Specific Character
-- The following are semantic tags created in starTrans, not the parser
| PNonCapture Pattern -- introduced by starTrans
| PNonEmpty Pattern -- introduced by starTrans
deriving (Eq,Show)
-- | I have not been checking, but this should have the property that
-- parsing the resulting string should result in an identical Pattern.
-- This is not true if starTrans has created PNonCapture and PNonEmpty
-- values or a (PStar False). The contents of a "[ ]" grouping are
-- always shown in a sorted canonical order.
showPattern :: Pattern -> String
showPattern pIn =
case pIn of
PEmpty -> "()"
PGroup _ p -> paren (showPattern p)
POr ps -> concat $ intersperse "|" (map showPattern ps)
PConcat ps -> concatMap showPattern ps
PQuest p -> (showPattern p)++"?"
PPlus p -> (showPattern p)++"+"
-- If PStar has mayFirstBeNull False then reparsing will forget this flag
PStar _ p -> (showPattern p)++"*"
PBound i (Just j) p | i==j -> showPattern p ++ ('{':show i)++"}"
PBound i mj p -> showPattern p ++ ('{':show i) ++ maybe ",}" (\j -> ',':show j++"}") mj
--
PCarat _ -> "^"
PDollar _ -> "$"
PDot _ -> "."
PAny _ ps -> ('[':show ps)++"]"
PAnyNot _ ps -> ('[':'^':show ps)++"]"
PEscape _ c -> '\\':c:[]
PChar _ c -> [c]
-- The following were not directly from the parser, and will not be parsed in properly
PNonCapture p -> showPattern p
PNonEmpty p -> showPattern p
where {-
groupRange x n (y:ys) = if (fromEnum y)-(fromEnum x) == n then groupRange x (succ n) ys
else (if n <=3 then take n [x..]
else x:'-':(toEnum (pred n+fromEnum x)):[]) ++ groupRange y 1 ys
groupRange x n [] = if n <=3 then take n [x..]
else x:'-':(toEnum (pred n+fromEnum x)):[]
-}
paren s = ('(':s)++")"
data PatternSet = PatternSet (Maybe (Set Char))
(Maybe (Set PatternSetCharacterClass))
(Maybe (Set PatternSetCollatingElement))
(Maybe (Set PatternSetEquivalenceClass))
deriving (Eq)
instance Show PatternSet where
showsPrec i (PatternSet s scc sce sec) =
let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s
charSpec = (if ']' `elem` special then (']':) else id) (byRange normal)
scc' = maybe "" ((concatMap show) . Set.toList) scc
sce' = maybe "" ((concatMap show) . Set.toList) sce
sec' = maybe "" ((concatMap show) . Set.toList) sec
in shows charSpec
. showsPrec i scc' . showsPrec i sce' . showsPrec i sec'
. if '-' `elem` special then showChar '-' else id
where byRange xAll@(x:xs) | length xAll <=3 = xAll
| otherwise = groupRange x 1 xs
byRange _ = undefined
groupRange x n (y:ys) = if (fromEnum y)-(fromEnum x) == n then groupRange x (succ n) ys
else (if n <=3 then take n [x..]
else x:'-':(toEnum (pred n+fromEnum x)):[]) ++ groupRange y 1 ys
groupRange x n [] = if n <=3 then take n [x..]
else x:'-':(toEnum (pred n+fromEnum x)):[]
newtype PatternSetCharacterClass = PatternSetCharacterClass {unSCC::String}
deriving (Eq,Ord)
newtype PatternSetCollatingElement = PatternSetCollatingElement {unSCE::String}
deriving (Eq,Ord)
newtype PatternSetEquivalenceClass = PatternSetEquivalenceClass {unSEC::String}
deriving (Eq,Ord)
instance Show PatternSetCharacterClass where
showsPrec _ p = showChar '[' . showChar ':' . shows (unSCC p) . showChar ':' . showChar ']'
instance Show PatternSetCollatingElement where
showsPrec _ p = showChar '[' . showChar '.' . shows (unSCE p) . showChar '.' . showChar ']'
instance Show PatternSetEquivalenceClass where
showsPrec _ p = showChar '[' . showChar '=' . shows (unSEC p) . showChar '=' . showChar ']'
-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==
-- | Do the transformation and simplification in a single traversal.
-- This removes the PPlus, PQuest, and PBound values, changing to POr
-- and PEmpty and PStar True\/False. For some PBound values it adds
-- PNonEmpty and PNonCapture semantic marker. It also simplifies to
-- flatten out nested POr and PConcat instances and eliminate some
-- unneeded PEmpty values.
starTrans :: Pattern -> Pattern
starTrans = dfsPattern (simplify' . starTrans')
-- | Apply a Pattern transfomation function depth first
dfsPattern :: (Pattern -> Pattern) -- ^ The transformation function
-> Pattern -- ^ The Pattern to transform
-> Pattern -- ^ The transformed Pattern
dfsPattern f = dfs
where unary c = f . c . dfs
dfs pattern = case pattern of
POr ps -> f (POr (map dfs ps))
PConcat ps -> f (PConcat (map dfs ps))
PGroup i p -> unary (PGroup i) p
PQuest p -> unary PQuest p
PPlus p -> unary PPlus p
PStar i p -> unary (PStar i) p
PBound i mi p -> unary (PBound i mi) p
_ -> f pattern
{- Replace by PNonCapture
unCapture = dfsPattern unCapture' where
unCapture' (PGroup (Just _) p) = PGroup Nothing p
unCapture' x = x
-}
reGroup :: Pattern -> Pattern
reGroup p@(PConcat xs) | 2 <= length xs = PGroup Nothing p
reGroup p@(POr xs) | 2 <= length xs = PGroup Nothing p
reGroup p = p
starTrans' :: Pattern -> Pattern
starTrans' pIn =
case pIn of -- We know that "p" has been simplified in each of these cases:
PQuest p -> POr [p,PEmpty]
{- The PStar should not capture 0 characters on its first iteration,
so set its mayFirstBeNull flag to False
-}
PPlus p | canOnlyMatchNull p -> p
| otherwise -> asGroup $ PConcat [reGroup p,PStar False p]
{- "An ERE matching a single character repeated by an '*' , '?' , or
an interval expression shall not match a null expression unless
this is the only match for the repetition or it is necessary to
satisfy the exact or minimum number of occurrences for the interval
expression."
-}
{- p? is p|PEmpty which prefers even a 0-character match for p
p{0,1} is p? is POr [p,PEmpty]
p{0,2} is (pp?)? NOT p?p?
p{0,3} is (p(pp?)?)?
p{1,2} is like pp{0,1} is like pp? but see below
p{2,5} is ppp{0,3} is pp(p(pp?)?)?
But this is not always right. Because if the second use of p in
p?p? matches 0 characters then the perhaps non 0 character match of
the first p is overwritten.
We need a new operation "p!" that means "p?" unless "p" match 0
characters, in which case skip p as if it failed in "p?". Thus
when p cannot accept 0 characters p! and p? are equivalent. And
when p can only match 0 characters p! is PEmpty. So for
simplicity, only use ! when p can match 0 characters but not only 0
characters.
Call this (PNonEmpty p) in the Pattern type.
p! is PNonEmpty p is POr [PEmpty,p]
IS THIS TRUE? Use QuickCheck?
Note that if p cannot match 0 characters then p! is p? and vice versa
The p{0,1} is still always p? and POr [p,PEmpty]
Now p{0,2} means p?p! or (pp!)? and p{0,3} means (p(pp!)!)? or p?p!p!
Equivalently p?p! and p?p!p!
And p{2,2} is p'p and p{3,3} is p'p'p and p{4} is p'p'p'p
The p{1,2} is pp! and p{1,3} is pp!p! or p(pp!)!
And p{2,4} means p'pp!p! and p{3,6} is p'p'pp!p!p! or p'p'p(p(pp!)!)!
But this second form still has a problem: the (pp!)! can have the first
p match 0 and the second p match non-zero. This showed up for (.|$){1,3}
since ($.!)! should not be a valid path but altered the qt_win commands.
Thus only p'p'pp!p!p! has the right semantics. For completeness:
if p can only match only 0 characters then the cases are
p{0,0} is (), p{0,_} = p?, p{_,_} is p
if p can match 0 or non-zero characters then cases are
p{0,0} is (), p{0,1} is (p)?, p{0,2} is (pp!)?, p{0,3} is (pp!p!)?
p{1,1} is p, p{1,2} is pp!, p{1,3} is pp!p!, p{1,4} is pp!p!p!
p{2,2} is p'p,
p{2,3} is p'pp!,
p{2,4} is p'pp!p! or p'p(pp!)!
p{2,5} is p'pp!p!p! or p'p(p(pp!)!)!
p{3,3} is p'p'p, p{3,4} is p'p'pp!, p{3,5} is p'p'pp!p!, p{3,6} is p'p'pp!p!p!
if p can only match 1 or more characters then cases are
p{0,0} is ()
p{0,1} is p?, p{0,2} is (pp?)?, p{0,3} is (p(pp?)?)?, p{0,4} is (pp{0,3})?
p{1,1} is p, p{1,j} is pp{0,pred j}
p{2,2} is p'p, p{2,3} is p'pp?, p{2,4} is p'p(pp?)?, p{2,5} = p'p{1,4} = p'(pp{0,3})
p{3,3} is p'p'p, p{3,4} is p'p'pp?, p{3,5} is p'p'p(pp?)?, p{3,6} is
And by this logic, the PStar False is really p*! So p{0,} is p*
and p{1,} is pp*! and p{2,} is p'pp*! and p{3,} is p'p'pp*!
The (nonEmpty' p) below is the only way PNonEmpty is introduced
into the Pattern. It is always preceded by p inside a PConcat
list. The p involved never simplifies to PEmpty. Thus it is
impossible to have PNonEmpty directly nested, i.e. (PNonEmpty
(PNonEmpty _)) never occurs even after simplifications.
The (nonCapture' p) below is the only way PNonCapture is
introduced into the Pattern. It is always followed by p inside a
PConcat list.
-}
-- Easy cases
PBound i _ _ | i<0 -> PEmpty -- impossibly malformed
PBound i (Just j) _ | i>j -> PEmpty -- impossibly malformed
PBound _ (Just 0) _ -> PEmpty
-- Medium cases
PBound 0 Nothing p | canOnlyMatchNull p -> quest p
| otherwise -> PStar True p
PBound 0 (Just 1) p -> quest p
-- Hard cases
PBound i Nothing p | canOnlyMatchNull p -> p
| otherwise -> asGroup . PConcat $ apply (nc'p:) (pred i) [reGroup p,PStar False p]
where nc'p = nonCapture' p
PBound 0 (Just j) p | canOnlyMatchNull p -> quest p
-- The first operation is quest NOT nonEmpty. This can be tested with
-- "a\nb" "((^)?|b){0,3}" and "a\nb" "((^)|b){0,3}"
| otherwise -> quest . (concat' p) $
apply (nonEmpty' . (concat' p)) (j-2) (nonEmpty' p)
{- 0.99.6 remove
| cannotMatchNull p -> apply (quest' . (concat' p)) (pred j) (quest' p)
| otherwise -> POr [ simplify' (PConcat (p : replicate (pred j) (nonEmpty' p))) , PEmpty ]
-}
{- 0.99.6 add, 0.99.7 remove
PBound i (Just j) p | canOnlyMatchNull p -> p
| i == j -> PConcat $ apply (p':) (pred i) [p]
| otherwise -> PConcat $ apply (p':) (pred i)
[p,apply (nonEmpty' . (concat' p)) (j-i-1) (nonEmpty' p) ]
where p' = nonCapture' p
-}
{- 0.99.7 add -}
PBound i (Just j) p | canOnlyMatchNull p -> p
| i == j -> asGroup . PConcat $ apply (nc'p:) (pred i) [reGroup p]
| otherwise -> asGroup . PConcat $ apply (nc'p:) (pred i)
[reGroup p,apply (nonEmpty' . (concat' p)) (j-i-1) (ne'p) ]
where nc'p = nonCapture' p
ne'p = nonEmpty' p
{- 0.99.6
| cannotMatchNull p -> PConcat $ apply (p':) (pred i) $ (p:) $
[apply (quest' . (concat' p)) (pred (j-i)) (quest' p)]
| otherwise -> PConcat $ (replicate (pred i) p') ++ p : (replicate (j-i) (nonEmpty' p))
-}
PStar mayFirstBeNull p | canOnlyMatchNull p -> if mayFirstBeNull then quest p
else PEmpty
| otherwise -> pass
-- Left intact
PEmpty -> pass
PGroup {} -> pass
POr {} -> pass
PConcat {} -> pass
PCarat {} -> pass
PDollar {} -> pass
PDot {} -> pass
PAny {} -> pass
PAnyNot {} -> pass
PEscape {} -> pass
PChar {} -> pass
PNonCapture {} -> pass
PNonEmpty {} -> pass -- TODO : remove PNonEmpty from program
where
quest = (\ p -> POr [p,PEmpty]) -- require p to have been simplified
-- quest' = (\ p -> simplify' $ POr [p,PEmpty]) -- require p to have been simplified
concat' a b = simplify' $ PConcat [reGroup a,reGroup b] -- require a and b to have been simplified
nonEmpty' = (\ p -> simplify' $ POr [PEmpty,p]) -- 2009-01-19 : this was PNonEmpty
nonCapture' = PNonCapture
apply f n x = foldr ($) x (replicate n f) -- function f applied n times to x : f^n(x)
asGroup p = PGroup Nothing (simplify' p)
pass = pIn
-- | Function to transform a pattern into an equivalent, but less
-- redundant form. Nested 'POr' and 'PConcat' are flattened. PEmpty
-- is propagated.
simplify' :: Pattern -> Pattern
simplify' x@(POr _) =
let ps' = case span notPEmpty (flatten x) of
(notEmpty,[]) -> notEmpty
(notEmpty,_:rest) -> notEmpty ++ (PEmpty:filter notPEmpty rest) -- keep 1st PEmpty only
in case ps' of
[] -> PEmpty
[p] -> p
_ -> POr ps'
simplify' x@(PConcat _) =
let ps' = filter notPEmpty (flatten x)
in case ps' of
[] -> PEmpty
[p] -> p
_ -> PConcat ps' -- PConcat ps'
simplify' (PStar _ PEmpty) = PEmpty
simplify' (PNonCapture PEmpty) = PEmpty -- 2009, perhaps useful
--simplify' (PNonEmpty PEmpty) = err "simplify' (PNonEmpty PEmpty) = should be Impossible!" -- 2009
simplify' other = other
-- | Function to flatten nested POr or nested PConcat applicataions.
flatten :: Pattern -> [Pattern]
flatten (POr ps) = (concatMap (\x -> case x of
POr ps' -> ps'
p -> [p]) ps)
flatten (PConcat ps) = (concatMap (\x -> case x of
PConcat ps' -> ps'
p -> [p]) ps)
flatten _ = err "flatten can only be applied to POr or PConcat"
notPEmpty :: Pattern -> Bool
notPEmpty PEmpty = False
notPEmpty _ = True
-- | Determines if pIn will fail or accept [] and never accept any
-- characters. Treat PCarat and PDollar as True.
canOnlyMatchNull :: Pattern -> Bool
canOnlyMatchNull pIn =
case pIn of
PEmpty -> True
PGroup _ p -> canOnlyMatchNull p
POr ps -> all canOnlyMatchNull ps
PConcat ps -> all canOnlyMatchNull ps
PQuest p -> canOnlyMatchNull p
PPlus p -> canOnlyMatchNull p
PStar _ p -> canOnlyMatchNull p
PBound _ (Just 0) _ -> True
PBound _ _ p -> canOnlyMatchNull p
PCarat _ -> True
PDollar _ -> True
PNonCapture p -> canOnlyMatchNull p
-- PNonEmpty p -> canOnlyMatchNull p -- like PQuest
_ ->False
{-
-- | If 'cannotMatchNull' returns 'True' then it is known that the
-- 'Pattern' will never accept an empty string. If 'cannotMatchNull'
-- returns 'False' then it is possible but not definite that the
-- 'Pattern' could accept an empty string.
cannotMatchNull :: Pattern -> Bool
cannotMatchNull pIn =
case pIn of
PEmpty -> False
PGroup _ p -> cannotMatchNull p
POr [] -> False
POr ps -> all cannotMatchNull ps
PConcat [] -> False
PConcat ps -> any cannotMatchNull ps
PQuest _ -> False
PPlus p -> cannotMatchNull p
PStar {} -> False
PBound 0 _ _ -> False
PBound _ _ p -> cannotMatchNull p
PCarat _ -> False
PDollar _ -> False
PNonCapture p -> cannotMatchNull p
-- PNonEmpty _ -> False -- like PQuest
_ -> True
-}