-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathfilter.hs
81 lines (70 loc) · 2.14 KB
/
filter.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import Data.Char
import Data.String.Utils
import Text.Pandoc
import Text.Pandoc.Walk
import Control.Monad
import System.FilePath.Posix
import Control.Concurrent
import System.Environment
import System.FilePath.Posix ((</>))
import System.FilePath.Find
import ArchWiki (cleanDoc)
nth :: Int -> String -> Char
nth _ [] = ' '
nth 0 (x:_) = x
nth n (_:xs) = nth (n-1) xs
interestedIn :: String -> Bool
interestedIn file | nonAscii title = False
| isLangTagged title = False
| nth 0 title == '~' = False
| otherwise = True
where title = takeFileName file
nonAscii = any (not . isAscii)
isLangTagged = isAsciiUpper . nth 1 . dropWhile (/= '(')
enDoc :: Inline -> [(String, String)]
enDoc (Link [Str t] (u,_)) | interestedIn t = [(u,t)]
| otherwise = []
enDoc _ = []
parseIndex :: FilePath -> IO [(String, String)]
parseIndex path = do
html <- readFile path
return $ query enDoc (readHtml def html)
fork :: IO () -> IO (MVar ())
fork io = do
mvar <- newEmptyMVar
forkFinally io (\_ -> putMVar mvar ())
return mvar
dstDir = "wiki/"
srcDir = "usr/share/doc/arch-wiki/html/"
trimWiki :: String -> IO String
trimWiki file = do
html <- readFile $ file
let path = makeValid $ dstDir ++ (sanitize $ takeBaseName file) ++ ".md"
writeFile path (cleanDoc html)
return file
where
sanitize = replace "/" "_"
threadPoolIO :: Int -> (a -> IO b) -> IO (Chan a, Chan b)
threadPoolIO nr mutator = do
input <- newChan
output <- newChan
forM_ [1..nr] $
\_ -> forkIO (forever $ do
i <- readChan input
o <- mutator i
writeChan output o)
return (input, output)
runPool :: Int -> [IO a] -> IO ()
runPool n as = do
(input, output) <- threadPoolIO n (id)
forM_ as $ writeChan input
replicateM_ (length as) (readChan output)
main :: IO ()
main = do
getArgs >>= work
where work [root] = do
htmls <- find always
((extension ==? ".html")
&&? (interestedIn `liftM` fileName))
(root </> srcDir)
runPool 4 (map trimWiki htmls)