Compare commits

..

2 Commits

Author SHA1 Message Date
0418e29edf z 2026-01-03 17:03:47 +08:00
5ba00b7fc9 Refactor quote handling and add new rules for possessive quotes 2026-01-03 15:14:31 +08:00
2 changed files with 139 additions and 25 deletions

View File

@@ -21,16 +21,29 @@ applyUntilFixed :: Rule -> Text -> Text
applyUntilFixed rule =
fix
( \loop current ->
let next = streamEdit rule id current
let next = streamEdit (try rule) id current
in if next == current then next else loop next
)
applyRulesRecursively :: RuleSet -> Text -> Text
applyRulesRecursively rules input = foldl (flip applyUntilFixed) input rules
applyRules :: RuleSet -> Text -> Text
applyRules rules input = foldl (flip applyUntilFixed) input rules
applyRules rules input = foldl (flip applyOnce) input rules
where
applyOnce rule = streamEdit (try rule) id
-------------------------------------------------------------------------------
-- rules for pangu
-- alphaNumChar from megaparsec matches CJK chars...
-- need to implement a new one
alphanumericChar :: Parser Char
alphanumericChar = satisfy $ \c ->
(c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| (c >= '0' && c <= '9')
-- | Check if a character falls within the CJK ranges provided
isCJK :: Char -> Bool
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
@@ -96,14 +109,14 @@ dotsCJK = do
fixCJKcolAN :: Rule
fixCJKcolAN = do
cjk <- cjkChar
_ <- chunk ":"
an <- alphaNumChar
_ <- char ':'
an <- alphanumericChar
return $ T.pack $ [cjk] ++ "" ++ [an]
-- quotes
-- seems confusing ...
quotesym :: [Char]
quotesym = "\x05f4\"\'`"
quotesym = "'`\x05f4\""
cjkquote :: Rule
cjkquote = do
@@ -125,14 +138,108 @@ fixQuote = do
closeQuotes <- T.pack <$> some (oneOf quotesym)
return $ openQuotes <> T.strip content <> closeQuotes
-- the rule set
myRules :: RuleSet
myRules =
cjkpossessivequote :: Rule
cjkpossessivequote = do
cjk <- cjkChar
_ <- char '\''
_ <- lookAhead $ anySingleBut 's'
return $ T.pack $ cjk : " '"
-- This singlequoteCJK rule will turn '你好' into ' 你好'
-- which seems not desirable...
-- however, the behavior is aligned with python version
singlequoteCJK :: Rule
singlequoteCJK = do
_ <- char '\''
cjk <- cjkChar
return $ T.pack $ "' " ++ [cjk]
fixPossessivequote :: Rule
fixPossessivequote = do
pre <- cjkChar <|> alphanumericChar
_ <- some spaceChar
_ <- chunk "'s"
return $ T.pack $ pre : "'s"
-- hash
hashANSCJKhash :: Rule
hashANSCJKhash = do
cjk1 <- cjkChar
_ <- char '#'
mid <- some cjkChar
_ <- char '#'
cjk2 <- cjkChar
return $ T.pack $ [cjk1] ++ " #" ++ mid ++ "# " ++ [cjk2]
cjkhash :: Rule
cjkhash = do
cjk <- cjkChar
_ <- char '#'
_ <- lookAhead $ anySingleBut ' '
return $ T.pack $ cjk : " #"
hashcjk :: Rule
hashcjk = do
_ <- char '#'
_ <- lookAhead $ anySingleBut ' '
cjk <- cjkChar
return $ T.pack $ "# " ++ [cjk]
-- operators
cjkOPTan :: Rule
cjkOPTan = do
cjk <- cjkChar
opt <- oneOf ("+-=*/&|<>%" :: [Char])
an <- alphanumericChar
return $ T.pack [cjk, ' ', opt, ' ', an]
anOPTcjk :: Rule
anOPTcjk = do
an <- alphanumericChar
opt <- oneOf ("+-=*/&|<>%" :: [Char])
cjk <- cjkChar
return $ T.pack [an, ' ', opt, ' ', cjk]
-- slash/bracket rules are not implemented
-- CJK and alphanumeric without space
cjkans :: Rule
cjkans = do
cjk <- cjkChar
_ <- lookAhead (alphanumericChar <|> oneOf ("@$%^&*-+\\=|/" :: [Char]))
return $ T.pack [cjk, ' ']
anscjk :: Rule
anscjk = do
an <- alphanumericChar <|> oneOf ("~!$%^&*-+\\=|;:,./?" :: [Char])
_ <- lookAhead cjkChar
return $ T.pack [an, ' ']
-- rule set, the order matters
recursiveRules :: RuleSet
recursiveRules =
[ fullwidthCJKsymCJK,
fullwidthCJKsym,
dotsCJK,
fullwidthCJKsym
]
onepassRules :: RuleSet
onepassRules =
[ dotsCJK,
fixCJKcolAN,
cjkquote,
quoteCJK,
fixQuote
fixQuote,
cjkpossessivequote,
-- singlequoteCJK,
fixPossessivequote,
hashANSCJKhash,
cjkhash,
-- hashcjk,
anscjk,
cjkans,
empty -- a dummy rule
]
pangu :: Text -> Text
pangu input = applyRules onepassRules $ applyRulesRecursively recursiveRules input

View File

@@ -8,14 +8,21 @@ main :: IO ()
main = hspec $ do
describe "MyLib.cjksym(cjk)" $ do
it "converts symbols to fullwidth" $ do
applyRules myRules "你 : 好" `shouldBe` "你:好"
applyRules myRules "你.好" `shouldBe` "你。好"
applyRules myRules "你:好:他" `shouldBe` "你:好:他"
applyRules myRules "你 ? 好" `shouldBe` "你?好"
applyRules myRules "你…好" `shouldBe` "你… 好"
applyRules myRules "你...好" `shouldBe` "你... 好"
applyRules myRules "你:0" `shouldBe` "0"
applyRules myRules "我说:\" 他说:\'你好\'\"" `shouldBe` "我说:\"他说:\'你好\'\""
it "adds spaces" $ do
applyRules myRules "\'你好\'" `shouldBe` "\'你好\'"
applyRules myRules "\'hello\'" `shouldBe` "\'hello\'"
pangu "你 : 好" `shouldBe` "你:好"
pangu "你.好" `shouldBe` "你。好"
pangu "你:好:他" `shouldBe` "你:好:他"
pangu "你 ? 好" `shouldBe` "你?好"
pangu "你…好" `shouldBe` "你… 好"
pangu "你...好" `shouldBe` "你... 好"
pangu "你:0" `shouldBe` "0"
it "fixes quotes" $ do
pangu "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\""
-- pangu "'你好'" `shouldBe` "' 你好'" -- strange behavior
pangu "你'hello'" `shouldBe` "'hello'"
pangu "我 's " `shouldBe` "我's "
it "fixes hash" $ do
pangu "你好#测试#世界" `shouldBe` "你好 #测试# 世界"
it "add spaces" $ do
pangu "你好and世界" `shouldBe` "你好 and 世界"
pangu "當你凝視著bugbug也凝視著你" `shouldBe` "當你凝視著 bugbug 也凝視著你"
pangu "與PM戰鬥的人,應當小心自己不要成為PM" `shouldBe` "與 PM 戰鬥的人,應當小心自己不要成為 PM"