Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ fixed #119 ] latin1 encoding: each byte counts as 1 char #156

Merged
merged 1 commit into from
Jan 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions alex.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,21 @@ data-dir: data/

data-files:
AlexTemplate
AlexTemplate-debug
AlexTemplate-nopred
AlexTemplate-nopred-debug
AlexTemplate-latin1
AlexTemplate-latin1-debug
AlexTemplate-latin1-nopred
AlexTemplate-latin1-nopred-debug
AlexTemplate-ghc
AlexTemplate-ghc-nopred
AlexTemplate-ghc-debug
AlexTemplate-debug
AlexTemplate-ghc-nopred
AlexTemplate-ghc-nopred-debug
AlexTemplate-ghc-latin1
AlexTemplate-ghc-latin1-debug
AlexTemplate-ghc-latin1-nopred
AlexTemplate-ghc-latin1-nopred-debug
AlexWrapper-basic
AlexWrapper-basic-bytestring
AlexWrapper-strict-bytestring
Expand Down Expand Up @@ -110,6 +121,7 @@ extra-source-files:
tests/strict_typeclass.x
tests/unicode.x
tests/issue_71.x
tests/issue_119.x

source-repository head
type: git
Expand Down
39 changes: 32 additions & 7 deletions gen-alex-sdist/Main.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module Main (main) where

import Control.Monad
import qualified Data.List as List
import Language.Preprocessor.Cpphs
import System.Directory
import System.FilePath
Expand Down Expand Up @@ -51,13 +52,37 @@ all_template_files :: [FilePath]
all_template_files = map fst (templates ++ wrappers)

templates :: [(FilePath,[String])]
templates = [
("AlexTemplate", []),
("AlexTemplate-ghc", ["ALEX_GHC"]),
("AlexTemplate-ghc-nopred",["ALEX_GHC", "ALEX_NOPRED"]),
("AlexTemplate-ghc-debug", ["ALEX_GHC","ALEX_DEBUG"]),
("AlexTemplate-debug", ["ALEX_DEBUG"])
]
templates =
[ ( templateFileName ghc latin1 nopred debug
, templateFlags ghc latin1 nopred debug
)
| ghc <- allBool
, latin1 <- allBool
, nopred <- allBool
, debug <- allBool
]
where
allBool = [False, True]

-- Keep this function in sync with its twin in src/Main.hs.
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
templateFileName ghc latin1 nopred debug =
List.intercalate "-" $ concat
[ [ "AlexTemplate" ]
, [ "ghc" | ghc ]
, [ "latin1" | latin1 ]
, [ "nopred" | nopred ]
, [ "debug" | debug ]
]

templateFlags :: Bool -> Bool -> Bool -> Bool -> [String]
templateFlags ghc latin1 nopred debug =
map ("ALEX_" ++) $ concat
[ [ "GHC" | ghc ]
, [ "LATIN1" | latin1 ]
, [ "NOPRED" | nopred ]
, [ "DEBUG" | debug ]
]

wrappers :: [(FilePath,[String])]
wrappers = [
Expand Down
2 changes: 2 additions & 0 deletions src/AbsSyn.hs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ type StartCode = Int
-- we can generate somewhat faster code in the case that
-- the lexer doesn't use predicates
data UsesPreds = UsesPreds | DoesntUsePreds
deriving Eq

usesPreds :: DFA s a -> UsesPreds
usesPreds dfa
Expand Down Expand Up @@ -390,3 +391,4 @@ extractActions scheme scanner = (scanner{scannerTokens = new_tokens}, decl_str .
-- Code generation targets

data Target = GhcTarget | HaskellTarget
deriving Eq
41 changes: 23 additions & 18 deletions src/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import Control.Exception ( bracketOnError )
import Control.Monad ( when, liftM )
import Data.Char ( chr )
import Data.List ( isSuffixOf, nub )
import qualified Data.List as List
import Data.Maybe ( isJust, fromJust )
import Data.Version ( showVersion )
import System.Console.GetOpt ( getOpt, usageInfo, ArgOrder(..), OptDescr(..), ArgDescr(..) )
Expand Down Expand Up @@ -218,7 +219,7 @@ alex cli file basename script = do
hPutStr out_h (actions "")

-- add the template
let template_name = templateFile template_dir target usespreds cli
let template_name = templateFile template_dir target encoding usespreds cli
tmplt <- alexReadFile template_name
hPutStr out_h tmplt

Expand Down Expand Up @@ -399,23 +400,27 @@ templateDir def cli
[] -> def
ds -> return (last ds)

templateFile :: FilePath -> Target -> UsesPreds -> [CLIFlags] -> FilePath
templateFile dir target usespreds cli
= dir ++ "/AlexTemplate" ++ maybe_ghc ++ maybe_debug ++ maybe_nopred
where
maybe_ghc = case target of
GhcTarget -> "-ghc"
_ -> ""

maybe_debug
| OptDebugParser `elem` cli = "-debug"
| otherwise = ""

maybe_nopred =
case usespreds of
DoesntUsePreds | not (null maybe_ghc)
&& null maybe_debug -> "-nopred"
_ -> ""
-- Keep this function in sync with its twin in gen-alex-sdist/Main.hs.
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
templateFileName ghc latin1 nopred debug =
List.intercalate "-" $ concat
[ [ "AlexTemplate" ]
, [ "ghc" | ghc ]
, [ "latin1" | latin1 ]
, [ "nopred" | nopred ]
, [ "debug" | debug ]
]

templateFile :: FilePath -> Target -> Encoding -> UsesPreds -> [CLIFlags] -> FilePath
templateFile dir target encoding usespreds cli = concat
[ dir
, "/"
, templateFileName
(target == GhcTarget)
(encoding == Latin1)
(usespreds == DoesntUsePreds)
(OptDebugParser `elem` cli)
]

wrapperFile :: FilePath -> Scheme -> Maybe FilePath
wrapperFile dir scheme =
Expand Down
12 changes: 9 additions & 3 deletions templates/GenericTemplate.hs
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,15 @@ alex_scan_tkn user__ orig_input len input__ s last_acc =
ILIT(-1) -> (new_acc, input__)
-- on an error, we want to keep the input *before* the
-- character that failed, not after.
_ -> alex_scan_tkn user__ orig_input (if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
new_input new_s new_acc
_ -> alex_scan_tkn user__ orig_input
#ifdef ALEX_LATIN1
PLUS(len,ILIT(1))
-- issue 119: in the latin1 encoding, *each* byte is one character
#else
(if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
#endif
new_input new_s new_acc
}
where
check_accs (AlexAccNone) = last_acc
Expand Down
1 change: 1 addition & 0 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ TESTS = \
default_typeclass.x \
gscan_typeclass.x \
issue_71.x \
issue_119.x \
monad_typeclass.x \
monad_typeclass_bytestring.x \
monadUserState_typeclass.x \
Expand Down
73 changes: 73 additions & 0 deletions tests/issue_119.x
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
-- -*- haskell -*-
{
-- Issue 119,
-- reported 2017-10-11 by Herbert Valerio Riedel,
-- fixed 2020-01-26 by Andreas Abel.
--
-- Problem was: the computed token length (in number of characters)
-- attached to AlexToken is tailored to UTF8 encoding and wrong
-- for LATIN1 encoding.

module Main where

import Control.Monad (unless)
import qualified Data.ByteString as B
import Data.Word
import System.Exit (exitFailure)
}

%encoding "latin1"

:-

[\x01-\xff]+ { False }
[\x00] { True }

{
type AlexInput = B.ByteString

alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte = B.uncons

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar = undefined

-- generated by @alex@
alexScan :: AlexInput -> Int -> AlexReturn Bool

{-

GOOD cases:

("012\NUL3","012","\NUL3",3,3,False)
("\NUL0","\NUL","0",1,1,True)
("012","012","",3,3,False)

BAD case:

("0@P`p\128\144\160","0@P`p","",5,8,False)

expected:

("0@P`p\128\144\160","0@P`p\128\144\160","",8,8,False)

-}
main :: IO ()
main = do
go (B.pack [0x30,0x31,0x32,0x00,0x33]) -- GOOD
go (B.pack [0x00,0x30]) -- GOOD
go (B.pack [0x30,0x31,0x32]) -- GOOD

go (B.pack [0x30,0x40,0x50,0x60,0x70,0x80,0x90,0xa0]) -- WAS: BAD
where
go inp = do
case (alexScan inp 0) of
-- expected invariant: len == B.length inp - B.length inp'
AlexToken inp' len b -> do
let diff = B.length inp - B.length inp'
unless (len == diff) $ do
putStrLn $ "ERROR: reported length and consumed length differ!"
print (inp, B.take len inp, inp', len, diff, b)
exitFailure
_ -> undefined
}