mirror of
http://101.35.51.105:3000/congyu/Hakysidian.git
synced 2026-04-28 05:50:49 +08:00
first commit
This commit is contained in:
+227
@@ -0,0 +1,227 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Pangu (pangu, isCJK) where
|
||||
|
||||
import Data.Function (fix)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import Data.Void (Void)
|
||||
import Replace.Megaparsec (streamEdit)
|
||||
import Text.Megaparsec
|
||||
import Text.Megaparsec.Char
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
type Parser = Parsec Void Text
|
||||
|
||||
type Rule = Parser Text
|
||||
|
||||
type RuleSet = [Rule]
|
||||
|
||||
applyUntilFixed :: Rule -> Text -> Text
|
||||
applyUntilFixed rule =
|
||||
fix
|
||||
( \loop current ->
|
||||
let next = streamEdit (try rule) id current
|
||||
in if next == current then next else loop next
|
||||
)
|
||||
|
||||
applyRulesRecursively :: RuleSet -> Text -> Text
|
||||
applyRulesRecursively rules input = foldl (flip applyUntilFixed) input rules
|
||||
|
||||
applyRules :: RuleSet -> Text -> Text
|
||||
applyRules rules input = foldl (flip applyOnce) input rules
|
||||
where
|
||||
applyOnce rule = streamEdit (try rule) id
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
-- rules for pangu
|
||||
|
||||
-- alphaNumChar from megaparsec matches CJK chars...
|
||||
-- need to implement a new one
|
||||
alphanumericChar :: Parser Char
|
||||
alphanumericChar = satisfy $ \c ->
|
||||
(c >= 'a' && c <= 'z')
|
||||
|| (c >= 'A' && c <= 'Z')
|
||||
|| (c >= '0' && c <= '9')
|
||||
|
||||
-- | Check if a character falls within the CJK ranges provided
|
||||
isCJK :: Char -> Bool
|
||||
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
|
||||
where
|
||||
cjkRanges =
|
||||
[ ('\x2e80', '\x2eff'),
|
||||
('\x2f00', '\x2fdf'),
|
||||
('\x3040', '\x309f'),
|
||||
('\x30a0', '\x30fa'),
|
||||
('\x30fc', '\x30ff'),
|
||||
('\x3100', '\x312f'),
|
||||
('\x3200', '\x32ff'),
|
||||
('\x3400', '\x4dbf'),
|
||||
('\x4e00', '\x9fff'),
|
||||
('\xf900', '\xfaff')
|
||||
]
|
||||
|
||||
convertToFullwidth :: Char -> Char
|
||||
convertToFullwidth c =
|
||||
case c of
|
||||
':' -> ':'
|
||||
'.' -> '。'
|
||||
'~' -> '~'
|
||||
'!' -> '!'
|
||||
'?' -> '?'
|
||||
',' -> ','
|
||||
';' -> ';'
|
||||
'\"' -> '”'
|
||||
'\'' -> '’'
|
||||
_ -> c
|
||||
|
||||
-- A parser that matches a single CJK character
|
||||
cjkChar :: Parser Char
|
||||
cjkChar = satisfy isCJK
|
||||
|
||||
-- use python.py as reference for these rules
|
||||
|
||||
fullwidthCJKsymCJK :: Rule
|
||||
fullwidthCJKsymCJK = do
|
||||
lcjk <- cjkChar
|
||||
_ <- many (char ' ')
|
||||
sym <- try (some (char ':')) <|> count 1 (char '.')
|
||||
_ <- many (char ' ')
|
||||
rcjk <- cjkChar
|
||||
let transformedsym = map convertToFullwidth sym
|
||||
return $ T.pack $ [lcjk] ++ transformedsym ++ [rcjk]
|
||||
|
||||
fullwidthCJKsym :: Rule
|
||||
fullwidthCJKsym = do
|
||||
cjk <- cjkChar
|
||||
_ <- many (char ' ')
|
||||
sym <- some $ oneOf ("~!?,;" :: [Char])
|
||||
_ <- many (char ' ')
|
||||
let transformedsym = T.pack $ map convertToFullwidth sym
|
||||
return $ T.pack [cjk] <> transformedsym
|
||||
|
||||
dotsCJK :: Rule
|
||||
dotsCJK = do
|
||||
dots <- chunk "..." <|> chunk "…"
|
||||
cjk <- cjkChar
|
||||
return $ dots <> T.pack (" " ++ [cjk])
|
||||
|
||||
fixCJKcolAN :: Rule
|
||||
fixCJKcolAN = do
|
||||
cjk <- cjkChar
|
||||
_ <- char ':'
|
||||
an <- alphanumericChar
|
||||
return $ T.pack $ [cjk] ++ ":" ++ [an]
|
||||
|
||||
-- quotes
|
||||
-- seems confusing ...
|
||||
quotesym :: [Char]
|
||||
quotesym = "'`\x05f4\""
|
||||
|
||||
cjkquote :: Rule
|
||||
cjkquote = do
|
||||
cjk <- cjkChar
|
||||
quote <- oneOf quotesym
|
||||
return $ T.pack $ [cjk] ++ " " ++ [quote]
|
||||
|
||||
quoteCJK :: Rule
|
||||
quoteCJK = do
|
||||
quote <- oneOf quotesym
|
||||
cjk <- cjkChar
|
||||
return $ T.pack $ [quote] ++ " " ++ [cjk]
|
||||
|
||||
fixQuote :: Rule
|
||||
fixQuote = do
|
||||
openQuotes <- T.pack <$> some (oneOf quotesym)
|
||||
_ <- many spaceChar
|
||||
content <- T.pack <$> someTill anySingle (lookAhead $ some (oneOf quotesym))
|
||||
closeQuotes <- T.pack <$> some (oneOf quotesym)
|
||||
return $ openQuotes <> T.strip content <> closeQuotes
|
||||
|
||||
cjkpossessivequote :: Rule
|
||||
cjkpossessivequote = do
|
||||
cjk <- cjkChar
|
||||
_ <- char '\''
|
||||
_ <- lookAhead $ anySingleBut 's'
|
||||
return $ T.pack $ cjk : " '"
|
||||
|
||||
-- This singlequoteCJK rule will turn '你好' into ' 你好'
|
||||
-- which seems not desirable...
|
||||
-- however, the behavior is aligned with python version
|
||||
singlequoteCJK :: Rule
|
||||
singlequoteCJK = do
|
||||
_ <- char '\''
|
||||
cjk <- cjkChar
|
||||
return $ T.pack $ "' " ++ [cjk]
|
||||
|
||||
fixPossessivequote :: Rule
|
||||
fixPossessivequote = do
|
||||
pre <- cjkChar <|> alphanumericChar
|
||||
_ <- some spaceChar
|
||||
_ <- chunk "'s"
|
||||
return $ T.pack $ pre : "'s"
|
||||
|
||||
-- hash
|
||||
hashANSCJKhash :: Rule
|
||||
hashANSCJKhash = do
|
||||
cjk1 <- cjkChar
|
||||
_ <- char '#'
|
||||
mid <- some cjkChar
|
||||
_ <- char '#'
|
||||
cjk2 <- cjkChar
|
||||
return $ T.pack $ [cjk1] ++ " #" ++ mid ++ "# " ++ [cjk2]
|
||||
|
||||
cjkhash :: Rule
|
||||
cjkhash = do
|
||||
cjk <- cjkChar
|
||||
_ <- char '#'
|
||||
_ <- lookAhead $ anySingleBut ' '
|
||||
return $ T.pack $ cjk : " #"
|
||||
|
||||
hashcjk :: Rule
|
||||
hashcjk = do
|
||||
_ <- char '#'
|
||||
_ <- lookAhead $ anySingleBut ' '
|
||||
cjk <- cjkChar
|
||||
return $ T.pack $ "# " ++ [cjk]
|
||||
|
||||
-- operators
|
||||
cjkOPTan :: Rule
|
||||
cjkOPTan = do
|
||||
cjk <- cjkChar
|
||||
opt <- oneOf ("+-=*/&|<>%" :: [Char])
|
||||
an <- alphanumericChar
|
||||
return $ T.pack [cjk, ' ', opt, ' ', an]
|
||||
|
||||
anOPTcjk :: Rule
|
||||
anOPTcjk = do
|
||||
an <- alphanumericChar
|
||||
opt <- oneOf ("+-=*/&|<>%" :: [Char])
|
||||
cjk <- cjkChar
|
||||
return $ T.pack [an, ' ', opt, ' ', cjk]
|
||||
|
||||
-- slash/bracket rules are not implemented
|
||||
|
||||
-- CJK and alphanumeric without space
|
||||
|
||||
cjkans :: Rule
|
||||
cjkans = do
|
||||
cjk <- cjkChar
|
||||
_ <- lookAhead (alphanumericChar <|> oneOf ("@$%^&*-+\\=|/" :: [Char]))
|
||||
return $ T.pack [cjk, ' ']
|
||||
|
||||
anscjk :: Rule
|
||||
anscjk = do
|
||||
an <- alphanumericChar <|> oneOf ("~!$%^&*-+\\=|;:,./?" :: [Char])
|
||||
_ <- lookAhead cjkChar
|
||||
return $ T.pack [an, ' ']
|
||||
|
||||
-- rule set, the order matters
|
||||
recursiveRules :: RuleSet
|
||||
recursiveRules = [fullwidthCJKsymCJK, fullwidthCJKsym]
|
||||
|
||||
onepassRules :: RuleSet
|
||||
onepassRules = [anscjk, cjkans]
|
||||
|
||||
pangu :: Text -> Text
|
||||
pangu input = applyRules onepassRules $ applyRulesRecursively recursiveRules input
|
||||
Reference in New Issue
Block a user