125 lines
2.9 KiB
Haskell
125 lines
2.9 KiB
Haskell
{-# LANGUAGE OverloadedStrings #-}
|
||
|
||
module MyLib where
|
||
|
||
import Data.Function (fix)
|
||
import Data.Text (Text)
|
||
import qualified Data.Text as T
|
||
import Data.Void (Void)
|
||
import Replace.Megaparsec (streamEdit)
|
||
import Text.Megaparsec
|
||
import Text.Megaparsec.Char
|
||
|
||
-------------------------------------------------------------------------------
|
||
type Parser = Parsec Void Text
|
||
|
||
type Rule = Parser Text
|
||
|
||
type RuleSet = [Rule]
|
||
|
||
applyUntilFixed :: Rule -> Text -> Text
|
||
applyUntilFixed rule =
|
||
fix
|
||
( \loop current ->
|
||
let next = streamEdit rule id current
|
||
in if next == current then next else loop next
|
||
)
|
||
|
||
applyRules :: RuleSet -> Text -> Text
|
||
applyRules rules input = foldl (flip applyUntilFixed) input rules
|
||
|
||
-------------------------------------------------------------------------------
|
||
-- rules for pangu
|
||
|
||
-- | Check if a character falls within the CJK ranges provided
|
||
isCJK :: Char -> Bool
|
||
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
|
||
where
|
||
cjkRanges =
|
||
[ ('\x2e80', '\x2eff'),
|
||
('\x2f00', '\x2fdf'),
|
||
('\x3040', '\x309f'),
|
||
('\x30a0', '\x30fa'),
|
||
('\x30fc', '\x30ff'),
|
||
('\x3100', '\x312f'),
|
||
('\x3200', '\x32ff'),
|
||
('\x3400', '\x4dbf'),
|
||
('\x4e00', '\x9fff'),
|
||
('\xf900', '\xfaff')
|
||
]
|
||
|
||
convertToFullwidth :: Char -> Char
|
||
convertToFullwidth c =
|
||
case c of
|
||
':' -> ':'
|
||
'.' -> '。'
|
||
'~' -> '~'
|
||
'!' -> '!'
|
||
'?' -> '?'
|
||
',' -> ','
|
||
';' -> ';'
|
||
'\"' -> '”'
|
||
'\'' -> '’'
|
||
_ -> c
|
||
|
||
-- A parser that matches a single CJK character
|
||
cjkChar :: Parser Char
|
||
cjkChar = satisfy isCJK
|
||
|
||
-- use python.py as reference for these rules
|
||
|
||
fullwidthCJKsymCJK :: Rule
|
||
fullwidthCJKsymCJK = do
|
||
lcjk <- cjkChar
|
||
_ <- many (char ' ')
|
||
sym <- fmap T.unpack (chunk ".") <|> some (oneOf (":" :: [Char]))
|
||
_ <- many (char ' ')
|
||
rcjk <- cjkChar
|
||
|
||
let transformedsym = T.pack $ map convertToFullwidth sym
|
||
return $ T.pack [lcjk] <> transformedsym <> T.pack [rcjk]
|
||
|
||
fullwidthCJKsym :: Rule
|
||
fullwidthCJKsym = do
|
||
cjk <- cjkChar
|
||
_ <- many (char ' ')
|
||
sym <- some $ oneOf ("~!?,;" :: [Char])
|
||
_ <- many (char ' ')
|
||
let transformedsym = T.pack $ map convertToFullwidth sym
|
||
return $ T.pack [cjk] <> transformedsym
|
||
|
||
dotsCJK :: Rule
|
||
dotsCJK = do
|
||
dots <- chunk "..." <|> chunk "…"
|
||
cjk <- cjkChar
|
||
return $ dots <> T.pack (" " ++ [cjk])
|
||
|
||
fixCJKcolAN :: Rule
|
||
fixCJKcolAN = do
|
||
cjk <- cjkChar
|
||
_ <- chunk ":"
|
||
an <- alphaNumChar
|
||
return $ T.pack $ [cjk] ++ ":" ++ [an]
|
||
|
||
cjkquote :: Rule
|
||
cjkquote = do
|
||
cjk <- cjkChar
|
||
quote <- oneOf ("\x05f4\"\'" :: [Char])
|
||
return $ T.pack $ [cjk] ++ " " ++ [quote]
|
||
|
||
quoteCJK :: Rule
|
||
quoteCJK = do
|
||
quote <- oneOf ("\x05f4\"\'" :: [Char])
|
||
cjk <- cjkChar
|
||
return $ T.pack $ [quote] ++ " " ++ [cjk]
|
||
|
||
-- the rule set
|
||
myRules :: RuleSet
|
||
myRules =
|
||
[ fullwidthCJKsymCJK,
|
||
fullwidthCJKsym,
|
||
dotsCJK,
|
||
fixCJKcolAN,
|
||
cjkquote,
|
||
quoteCJK
|
||
] |