This commit is contained in:
2026-01-03 17:03:47 +08:00
parent 5ba00b7fc9
commit 0418e29edf
2 changed files with 103 additions and 24 deletions

View File

@@ -21,16 +21,29 @@ applyUntilFixed :: Rule -> Text -> Text
applyUntilFixed rule =
fix
( \loop current ->
let next = streamEdit rule id current
let next = streamEdit (try rule) id current
in if next == current then next else loop next
)
applyRulesRecursively :: RuleSet -> Text -> Text
applyRulesRecursively rules input = foldl (flip applyUntilFixed) input rules
applyRules :: RuleSet -> Text -> Text
applyRules rules input = foldl (flip applyUntilFixed) input rules
applyRules rules input = foldl (flip applyOnce) input rules
where
applyOnce rule = streamEdit (try rule) id
-------------------------------------------------------------------------------
-- rules for pangu
-- alphaNumChar from megaparsec matches CJK chars...
-- need to implement a new one
alphanumericChar :: Parser Char
alphanumericChar = satisfy $ \c ->
(c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| (c >= '0' && c <= '9')
-- | Check if a character falls within the CJK ranges provided
isCJK :: Char -> Bool
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
@@ -97,7 +110,7 @@ fixCJKcolAN :: Rule
fixCJKcolAN = do
cjk <- cjkChar
_ <- char ':'
an <- alphaNumChar
an <- alphanumericChar
return $ T.pack $ [cjk] ++ "" ++ [an]
-- quotes
@@ -143,24 +156,76 @@ singlequoteCJK = do
fixPossessivequote :: Rule
fixPossessivequote = do
pre <- cjkChar <|> alphaNumChar
pre <- cjkChar <|> alphanumericChar
_ <- some spaceChar
_ <- chunk "'s"
return $ T.pack $ pre : "'s"
-- hash
-- hashANSCJKhash :: Rule
-- hashANSCJKhash = do
-- cjk1 <- cjkChar
-- _ <- char '#'
hashANSCJKhash :: Rule
hashANSCJKhash = do
cjk1 <- cjkChar
_ <- char '#'
mid <- some cjkChar
_ <- char '#'
cjk2 <- cjkChar
return $ T.pack $ [cjk1] ++ " #" ++ mid ++ "# " ++ [cjk2]
cjkhash :: Rule
cjkhash = do
cjk <- cjkChar
_ <- char '#'
_ <- lookAhead $ anySingleBut ' '
return $ T.pack $ cjk : " #"
hashcjk :: Rule
hashcjk = do
_ <- char '#'
_ <- lookAhead $ anySingleBut ' '
cjk <- cjkChar
return $ T.pack $ "# " ++ [cjk]
-- operators
cjkOPTan :: Rule
cjkOPTan = do
cjk <- cjkChar
opt <- oneOf ("+-=*/&|<>%" :: [Char])
an <- alphanumericChar
return $ T.pack [cjk, ' ', opt, ' ', an]
anOPTcjk :: Rule
anOPTcjk = do
an <- alphanumericChar
opt <- oneOf ("+-=*/&|<>%" :: [Char])
cjk <- cjkChar
return $ T.pack [an, ' ', opt, ' ', cjk]
-- slash/bracket rules are not implemented
-- CJK and alphanumeric without space
cjkans :: Rule
cjkans = do
cjk <- cjkChar
_ <- lookAhead (alphanumericChar <|> oneOf ("@$%^&*-+\\=|/" :: [Char]))
return $ T.pack [cjk, ' ']
anscjk :: Rule
anscjk = do
an <- alphanumericChar <|> oneOf ("~!$%^&*-+\\=|;:,./?" :: [Char])
_ <- lookAhead cjkChar
return $ T.pack [an, ' ']
-- rule set, the order matters
myRules :: RuleSet
myRules =
recursiveRules :: RuleSet
recursiveRules =
[ fullwidthCJKsymCJK,
fullwidthCJKsym,
dotsCJK,
fullwidthCJKsym
]
onepassRules :: RuleSet
onepassRules =
[ dotsCJK,
fixCJKcolAN,
cjkquote,
quoteCJK,
@@ -168,5 +233,13 @@ myRules =
cjkpossessivequote,
-- singlequoteCJK,
fixPossessivequote,
hashANSCJKhash,
cjkhash,
-- hashcjk,
anscjk,
cjkans,
empty -- a dummy rule
]
]
pangu :: Text -> Text
pangu input = applyRules onepassRules $ applyRulesRecursively recursiveRules input

View File

@@ -8,15 +8,21 @@ main :: IO ()
main = hspec $ do
describe "MyLib.cjksym(cjk)" $ do
it "converts symbols to fullwidth" $ do
applyRules myRules "你 : 好" `shouldBe` "你:好"
applyRules myRules "你.好" `shouldBe` "你。好"
applyRules myRules "你:好:他" `shouldBe` "你:好:他"
applyRules myRules "你 ? 好" `shouldBe` "你?好"
applyRules myRules "你…好" `shouldBe` "你… 好"
applyRules myRules "你...好" `shouldBe` "你... 好"
applyRules myRules "你:0" `shouldBe` "0"
pangu "你 : 好" `shouldBe` "你:好"
pangu "你.好" `shouldBe` "你。好"
pangu "你:好:他" `shouldBe` "你:好:他"
pangu "你 ? 好" `shouldBe` "你?好"
pangu "你…好" `shouldBe` "你… 好"
pangu "你...好" `shouldBe` "你... 好"
pangu "你:0" `shouldBe` "0"
it "fixes quotes" $ do
applyRules myRules "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\""
-- applyRules myRules "'你好'" `shouldBe` "' 你好'" -- strange behavior
applyRules myRules "你'hello'" `shouldBe` "你 'hello'"
applyRules myRules "我 's " `shouldBe` "我's "
pangu "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\""
-- pangu "'你好'" `shouldBe` "' 你好'" -- strange behavior
pangu "你'hello'" `shouldBe` "你 'hello'"
pangu "我 's " `shouldBe` "我's "
it "fixes hash" $ do
pangu "你好#测试#世界" `shouldBe` "你好 #测试# 世界"
it "add spaces" $ do
pangu "你好and世界" `shouldBe` "你好 and 世界"
pangu "當你凝視著bugbug也凝視著你" `shouldBe` "當你凝視著 bugbug 也凝視著你"
pangu "與PM戰鬥的人,應當小心自己不要成為PM" `shouldBe` "與 PM 戰鬥的人,應當小心自己不要成為 PM"