From 8984fefca9b75086ed87231a4381c9aed67626bc Mon Sep 17 00:00:00 2001 From: Evan Rutledge Borden Date: Wed, 24 Oct 2018 13:05:46 -0500 Subject: [PATCH 1/2] Add high order function for checking equality after transformation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is very common to transform two `Text` values and then compare them for equality. ```hs toCaseFold x == toCaseFold y ``` This operation ends up being costly because it must materialize each newly transformed `Text` in order to check their equality. Instead we can provide a helper for such patterns `transformEq`, which relies on the underlying `ustream` to check equality and avoids materializing the transformed `Text` in the average case. Benchmarks results for this operation show a worst case, which is close to the naive comparison and a best case which is much more agreeable. ``` benchmarking transformEq/Text ==: Eq time 629.6 ns (625.7 ns .. 634.5 ns) 1.000 R² (0.999 R² .. 1.000 R²) mean 630.0 ns (626.6 ns .. 634.2 ns) std dev 12.93 ns (9.856 ns .. 16.21 ns) variance introduced by outliers: 25% (moderately inflated) benchmarking transformEq/Text transformEq: Eq time 578.4 ns (575.6 ns .. 581.2 ns) 1.000 R² (1.000 R² .. 1.000 R²) mean 577.9 ns (575.1 ns .. 581.3 ns) std dev 10.35 ns (8.092 ns .. 13.02 ns) variance introduced by outliers: 21% (moderately inflated) benchmarking transformEq/Text ==: Not Eq time 663.1 ns (659.3 ns .. 666.8 ns) 1.000 R² (1.000 R² .. 1.000 R²) mean 660.4 ns (657.9 ns .. 664.4 ns) std dev 10.49 ns (7.178 ns .. 16.44 ns) variance introduced by outliers: 17% (moderately inflated) benchmarking transformEq/Text transformEq: Not Eq time 126.9 ns (126.1 ns .. 127.8 ns) 1.000 R² (1.000 R² .. 1.000 R²) mean 126.7 ns (126.2 ns .. 127.4 ns) std dev 2.122 ns (1.663 ns .. 2.636 ns) variance introduced by outliers: 21% (moderately inflated) benchmarking transformEq/Text ==: Not Length time 499.0 ns (496.3 ns .. 502.1 ns) 1.000 R² (1.000 R² .. 1.000 R²) mean 499.8 ns (497.2 ns .. 502.9 ns) std dev 9.655 ns (7.545 ns .. 11.84 ns) variance introduced by outliers: 23% (moderately inflated) benchmarking transformEq/Text transformEq: Not Length time 17.53 ns (17.45 ns .. 17.62 ns) 1.000 R² (1.000 R² .. 1.000 R²) mean 17.65 ns (17.55 ns .. 17.78 ns) std dev 359.4 ps (283.1 ps .. 500.4 ps) variance introduced by outliers: 31% (moderately inflated) ``` --- Data/Text.hs | 10 +++++++ benchmarks/haskell/Benchmarks.hs | 2 ++ benchmarks/haskell/Benchmarks/TransformEq.hs | 30 ++++++++++++++++++++ benchmarks/text-benchmarks.cabal | 1 + 4 files changed, 43 insertions(+) create mode 100644 benchmarks/haskell/Benchmarks/TransformEq.hs diff --git a/Data/Text.hs b/Data/Text.hs index df032059..3a90040b 100644 --- a/Data/Text.hs +++ b/Data/Text.hs @@ -167,6 +167,7 @@ module Data.Text , isPrefixOf , isSuffixOf , isInfixOf + , transformEq -- ** View patterns , stripPrefix @@ -1753,6 +1754,15 @@ isInfixOf needle haystack isInfixOf (singleton n) h = S.elem n (S.stream h) #-} +-- | The 'transformEq' function takes a `Text -> Text` function +-- and two 'Text's if the 'Text's are equal after transformation. This +-- function relies on fusion to avoid materialising the transformed 'Text' +transformEq :: (Text -> Text) -> Text -> Text -> Bool +transformEq f x y + | length x == length y = P.all (P.uncurry (==)) $ zip (f x) (f y) + | otherwise = False +{-# INLINE [1] transformEq #-} + ------------------------------------------------------------------------------- -- * View patterns diff --git a/benchmarks/haskell/Benchmarks.hs b/benchmarks/haskell/Benchmarks.hs index fdecba79..04387c08 100644 --- a/benchmarks/haskell/Benchmarks.hs +++ b/benchmarks/haskell/Benchmarks.hs @@ -14,6 +14,7 @@ import qualified Benchmarks.Concat as Concat import qualified Benchmarks.DecodeUtf8 as DecodeUtf8 import qualified Benchmarks.EncodeUtf8 as EncodeUtf8 import qualified Benchmarks.Equality as Equality +import qualified Benchmarks.TransformEq as TransformEq import qualified Benchmarks.FileRead as FileRead import qualified Benchmarks.FoldLines as FoldLines import qualified Benchmarks.Mul as Mul @@ -63,6 +64,7 @@ benchmarks = do , Search.benchmark (tf "russian.txt") "принимая" , Stream.benchmark (tf "russian.txt") , WordFrequencies.benchmark (tf "russian.txt") + , TransformEq.benchmark ] -- Program-like benchmarks diff --git a/benchmarks/haskell/Benchmarks/TransformEq.hs b/benchmarks/haskell/Benchmarks/TransformEq.hs new file mode 100644 index 00000000..0063a306 --- /dev/null +++ b/benchmarks/haskell/Benchmarks/TransformEq.hs @@ -0,0 +1,30 @@ +-- | Compare a string after a transformation +-- +-- Tested in this benchmark: +-- +-- * Comparison of transformed strings via transformEq +-- +module Benchmarks.TransformEq + ( benchmark + ) where + +import Criterion (Benchmark, bgroup, bench, whnf) +import Data.Function (on) +import Data.Text (transformEq, toCaseFold, pack) + +benchmark :: IO Benchmark +benchmark = do + let + equiv = (pack "Fooooooo", pack "fOOOOOOO") + notEq = (pack "fooooooo", pack "barrrrrr") + lengthNotEq = (pack "foo", pack "foooooooo") + eq = uncurry ((==) `on` toCaseFold) + transEq = uncurry $ transformEq toCaseFold + return $ bgroup "transformEq" + [ bench "Text ==: Eq" $ whnf eq equiv + , bench "Text transformEq: Eq" $ whnf transEq equiv + , bench "Text ==: Not Eq" $ whnf eq notEq + , bench "Text transformEq: Not Eq" $ whnf transEq notEq + , bench "Text ==: Not Length" $ whnf eq lengthNotEq + , bench "Text transformEq: Not Length" $ whnf transEq lengthNotEq + ] diff --git a/benchmarks/text-benchmarks.cabal b/benchmarks/text-benchmarks.cabal index 073dfeee..1c9490a2 100644 --- a/benchmarks/text-benchmarks.cabal +++ b/benchmarks/text-benchmarks.cabal @@ -76,6 +76,7 @@ executable text-benchmarks Benchmarks.Replace Benchmarks.Search Benchmarks.Stream + Benchmarks.TransformEq Benchmarks.WordFrequencies -- Source code for IUT (implementation under test) From 2f713091379128ba3d3285df1e6893373d8c354d Mon Sep 17 00:00:00 2001 From: Evan Rutledge Borden Date: Sat, 29 Dec 2018 13:02:53 -0600 Subject: [PATCH 2/2] Use stream equality in transformEq This version is ever so slightly more efficient and utilizes the underlying stream's equality instead of using `zip` and `all`. --- Data/Text.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Data/Text.hs b/Data/Text.hs index 3a90040b..dad9bdae 100644 --- a/Data/Text.hs +++ b/Data/Text.hs @@ -1759,7 +1759,7 @@ isInfixOf needle haystack -- function relies on fusion to avoid materialising the transformed 'Text' transformEq :: (Text -> Text) -> Text -> Text -> Bool transformEq f x y - | length x == length y = P.all (P.uncurry (==)) $ zip (f x) (f y) + | length x == length y = S.stream (f x) == S.stream (f y) | otherwise = False {-# INLINE [1] transformEq #-}