Skip to content

Commit 60d5932

Browse files
authored
Merge pull request #155 from andreasabel/issue71
[ #71 ] warn about nullable regexs in the absence of start codes
2 parents 25c8c6d + 564f22a commit 60d5932

File tree

4 files changed

+108
-44
lines changed

4 files changed

+108
-44
lines changed

src/AbsSyn.hs

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ module AbsSyn (
1414
wrapperName,
1515
Scanner(..),
1616
RECtx(..),
17-
RExp(..),
17+
RExp(..), nullable,
1818
DFA(..), State(..), SNum, StartCode, Accept(..),
1919
RightContext(..), showRCtx, strtype,
2020
encodeStartCodes, extractActions,
@@ -188,21 +188,21 @@ usesPreds dfa
188188
-- Regular expressions
189189

190190
-- `RExp' provides an abstract syntax for regular expressions. `Eps' will
191-
-- match empty strings; `Ch p' matches strings containinng a single character
191+
-- match empty strings; `Ch p' matches strings containing a single character
192192
-- `c' if `p c' is true; `re1 :%% re2' matches a string if `re1' matches one of
193193
-- its prefixes and `re2' matches the rest; `re1 :| re2' matches a string if
194194
-- `re1' or `re2' matches it; `Star re', `Plus re' and `Ques re' can be
195195
-- expressed in terms of the other operators. See the definitions of `ARexp'
196196
-- for a formal definition of the semantics of these operators.
197197

198198
data RExp
199-
= Eps
200-
| Ch CharSet
201-
| RExp :%% RExp
202-
| RExp :| RExp
203-
| Star RExp
204-
| Plus RExp
205-
| Ques RExp
199+
= Eps -- ^ Empty.
200+
| Ch CharSet -- ^ Singleton.
201+
| RExp :%% RExp -- ^ Sequence.
202+
| RExp :| RExp -- ^ Alternative.
203+
| Star RExp -- ^ Zero or more repetitions.
204+
| Plus RExp -- ^ One or more repetitions.
205+
| Ques RExp -- ^ Zero or one repetitions.
206206

207207
instance Show RExp where
208208
showsPrec _ Eps = showString "()"
@@ -213,6 +213,17 @@ instance Show RExp where
213213
showsPrec _ (Plus r) = shows r . ('+':)
214214
showsPrec _ (Ques r) = shows r . ('?':)
215215

216+
-- | A regular expression is nullable if it matches the empty string.
217+
nullable :: RExp -> Bool
218+
nullable Eps = True
219+
nullable Ch{} = False
220+
nullable (l :%% r) = nullable l && nullable r
221+
nullable (l :| r) = nullable l || nullable r
222+
nullable Star{} = True
223+
nullable (Plus r) = nullable r
224+
nullable Ques{} = True
225+
226+
216227
{------------------------------------------------------------------------------
217228
Abstract Regular Expression
218229
------------------------------------------------------------------------------}

src/Main.hs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import Info
1818
import Map ( Map )
1919
import qualified Map hiding ( Map )
2020
import Output
21-
import ParseMonad ( runP )
21+
import ParseMonad ( runP, Warning(..) )
2222
import Parser
2323
import Scan
2424
import Util ( hline )
@@ -114,11 +114,20 @@ parseScript file prg =
114114
Left (Nothing, err) ->
115115
die (file ++ ": " ++ err ++ "\n")
116116

117-
Right script@(_, _, scanner, _) -> do
117+
Right (warnings, script@(_, _, scanner, _)) -> do
118118
-- issue 46: give proper error when lexer definition is empty
119119
when (null $ scannerTokens scanner) $
120120
dieAlex $ file ++ " contains no lexer rules\n"
121+
-- issue 71: warn about nullable regular expressions
122+
mapM_ printWarning warnings
121123
return script
124+
where
125+
printWarning (WarnNullableRExp (AlexPn _ line col) msg) =
126+
hPutStrLn stderr $ concat
127+
[ "Warning: "
128+
, file , ":", show line , ":" , show col , ": "
129+
, msg
130+
]
122131

123132
alex :: [CLIFlags]
124133
-> FilePath

src/ParseMonad.hs

Lines changed: 60 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
-- -----------------------------------------------------------------------------
2-
--
2+
--
33
-- ParseMonad.hs, part of Alex
44
--
55
-- (c) Simon Marlow 2003
@@ -9,7 +9,7 @@
99
module ParseMonad (
1010
AlexInput, alexInputPrevChar, alexGetChar, alexGetByte,
1111
AlexPosn(..), alexStartPos,
12-
12+
Warning(..), warnIfNullable,
1313
P, runP, StartCode, failP, lookupSMac, lookupRMac, newSMac, newRMac,
1414
setStartCode, getStartCode, getInput, setInput,
1515
) where
@@ -23,7 +23,7 @@ import UTF8
2323
#if __GLASGOW_HASKELL__ < 710
2424
import Control.Applicative ( Applicative(..) )
2525
#endif
26-
import Control.Monad ( liftM, ap )
26+
import Control.Monad ( liftM, ap, when )
2727
import Data.Word (Word8)
2828
-- -----------------------------------------------------------------------------
2929
-- The input type
@@ -49,15 +49,15 @@ alexGetChar (_, _ ,_ : _, _) = undefined -- hide compiler warning
4949
alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)
5050
alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s))
5151
alexGetByte (_,_,[],[]) = Nothing
52-
alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c
52+
alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c
5353
(b:bs) = UTF8.encode c
5454
in p' `seq` Just (b, (p', c, bs, s))
5555

5656
-- -----------------------------------------------------------------------------
5757
-- Token positions
5858

5959
-- `Posn' records the location of a token in the input text. It has three
60-
-- fields: the address (number of chacaters preceding the token), line number
60+
-- fields: the address (number of charaters preceding the token), line number
6161
-- and column of a token within the file. `start_pos' gives the position of the
6262
-- start of the file and `eof_pos' a standard encoding for the end of file.
6363
-- `move_pos' calculates the new position after traversing a given character,
@@ -77,15 +77,22 @@ alexMove (AlexPn a l c) _ = AlexPn (a+1) l (c+1)
7777
-- -----------------------------------------------------------------------------
7878
-- Alex lexing/parsing monad
7979

80+
data Warning
81+
= WarnNullableRExp
82+
{ _warnPos :: AlexPosn -- ^ The position of the code following the regex.
83+
, _warnText :: String -- ^ Warning text.
84+
}
85+
8086
type ParseError = (Maybe AlexPosn, String)
8187
type StartCode = Int
8288

83-
data PState = PState {
84-
smac_env :: Map String CharSet,
85-
rmac_env :: Map String RExp,
86-
startcode :: Int,
87-
input :: AlexInput
88-
}
89+
data PState = PState
90+
{ warnings :: [Warning] -- ^ Stack of warnings, top = last warning.
91+
, smac_env :: Map String CharSet
92+
, rmac_env :: Map String RExp
93+
, startcode :: Int
94+
, input :: AlexInput
95+
}
8996

9097
newtype P a = P { unP :: PState -> Either ParseError (PState,a) }
9198

@@ -102,15 +109,27 @@ instance Monad P where
102109
Right (env',ok) -> unP (k ok) env'
103110
return = pure
104111

105-
runP :: String -> (Map String CharSet, Map String RExp)
106-
-> P a -> Either ParseError a
107-
runP str (senv,renv) (P p)
112+
-- | Run the parser on given input.
113+
runP :: String
114+
-- ^ Input string.
115+
-> (Map String CharSet, Map String RExp)
116+
-- ^ Character set and regex definitions.
117+
-> P a
118+
-- ^ Parsing computation.
119+
-> Either ParseError ([Warning], a)
120+
-- ^ List of warnings in first-to-last order, result.
121+
runP str (senv,renv) (P p)
108122
= case p initial_state of
109123
Left err -> Left err
110-
Right (_,a) -> Right a
111-
where initial_state =
112-
PState{ smac_env=senv, rmac_env=renv,
113-
startcode = 0, input=(alexStartPos,'\n',[],str) }
124+
Right (s, a) -> Right (reverse (warnings s), a)
125+
where
126+
initial_state = PState
127+
{ warnings = []
128+
, smac_env = senv
129+
, rmac_env = renv
130+
, startcode = 0
131+
, input = (alexStartPos, '\n', [], str)
132+
}
114133

115134
failP :: String -> P a
116135
failP str = P $ \PState{ input = (p,_,_,_) } -> Left (Just p,str)
@@ -121,24 +140,24 @@ failP str = P $ \PState{ input = (p,_,_,_) } -> Left (Just p,str)
121140

122141
lookupSMac :: (AlexPosn,String) -> P CharSet
123142
lookupSMac (posn,smac)
124-
= P $ \s@PState{ smac_env = senv } ->
143+
= P $ \s@PState{ smac_env = senv } ->
125144
case Map.lookup smac senv of
126145
Just ok -> Right (s,ok)
127146
Nothing -> Left (Just posn, "unknown set macro: $" ++ smac)
128147

129148
lookupRMac :: String -> P RExp
130-
lookupRMac rmac
131-
= P $ \s@PState{ rmac_env = renv } ->
149+
lookupRMac rmac
150+
= P $ \s@PState{ rmac_env = renv } ->
132151
case Map.lookup rmac renv of
133152
Just ok -> Right (s,ok)
134153
Nothing -> Left (Nothing, "unknown regex macro: %" ++ rmac)
135154

136155
newSMac :: String -> CharSet -> P ()
137-
newSMac smac set
156+
newSMac smac set
138157
= P $ \s -> Right (s{smac_env = Map.insert smac set (smac_env s)}, ())
139158

140159
newRMac :: String -> RExp -> P ()
141-
newRMac rmac rexp
160+
newRMac rmac rexp
142161
= P $ \s -> Right (s{rmac_env = Map.insert rmac rexp (rmac_env s)}, ())
143162

144163
setStartCode :: StartCode -> P ()
@@ -152,3 +171,21 @@ getInput = P $ \s -> Right (s, input s)
152171

153172
setInput :: AlexInput -> P ()
154173
setInput inp = P $ \s -> Right (s{ input = inp }, ())
174+
175+
-- | Add a warning if given regular expression is nullable
176+
-- unless the user wrote the regex 'Eps'.
177+
warnIfNullable
178+
:: RExp -- ^ Regular expression.
179+
-> AlexPosn -- ^ Position associated to regular expression.
180+
-> P ()
181+
-- If the user wrote @()@, they wanted to match the empty sequence!
182+
-- Thus, skip the warning then.
183+
warnIfNullable Eps _ = return ()
184+
warnIfNullable r pos = when (nullable r) $ P $ \ s ->
185+
Right (s{ warnings = WarnNullableRExp pos w : warnings s}, ())
186+
where
187+
w = unwords
188+
[ "Regular expression"
189+
, show r
190+
, "matches the empty string."
191+
]

src/Parser.y

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -109,16 +109,23 @@ tokendefs :: { [RECtx] }
109109
| {- empty -} { [] }
110110
111111
tokendef :: { [RECtx] }
112-
: startcodes rule { [ replaceCodes $1 $2 ] }
112+
: startcodes rule { [ replaceCodes $1 (snd $2) ] }
113113
| startcodes '{' rules '}' { map (replaceCodes $1) $3 }
114-
| rule { [ $1 ] }
115-
116-
rule :: { RECtx }
117-
: context rhs { let (l,e,r) = $1 in
118-
RECtx [] l e r $2 }
114+
| rule {% do
115+
let (pos, res@(RECtx _ _ e _ _)) = $1
116+
warnIfNullable e pos
117+
return [ res ]
118+
}
119+
120+
rule :: { (AlexPosn, RECtx) }
121+
: context rhs { let
122+
(l, e, r) = $1
123+
(pos, code) = $2
124+
in (pos, RECtx [] l e r code)
125+
}
119126
120127
rules :: { [RECtx] }
121-
: rule rules { $1 : $2 }
128+
: rule rules { snd $1 : $2 }
122129
| {- empty -} { [] }
123130
124131
startcodes :: { [(String,StartCode)] }
@@ -132,9 +139,9 @@ startcode :: { String }
132139
: ZERO { "0" }
133140
| ID { $1 }
134141
135-
rhs :: { Maybe Code }
136-
: CODE { case $1 of T _ (CodeT code) -> Just code }
137-
| ';' { Nothing }
142+
rhs :: { (AlexPosn, Maybe Code) }
143+
: CODE { case $1 of T pos (CodeT code) -> (pos, Just code) }
144+
| ';' { (tokPosn $1, Nothing) }
138145
139146
context :: { Maybe CharSet, RExp, RightContext RExp }
140147
: left_ctx rexp right_ctx { (Just $1,$2,$3) }

0 commit comments

Comments
 (0)