This commit is contained in:
2026-01-03 17:03:47 +08:00
parent 5ba00b7fc9
commit 0418e29edf
2 changed files with 103 additions and 24 deletions

View File

@@ -21,16 +21,29 @@ applyUntilFixed :: Rule -> Text -> Text
applyUntilFixed rule = applyUntilFixed rule =
fix fix
( \loop current -> ( \loop current ->
let next = streamEdit rule id current let next = streamEdit (try rule) id current
in if next == current then next else loop next in if next == current then next else loop next
) )
applyRulesRecursively :: RuleSet -> Text -> Text
applyRulesRecursively rules input = foldl (flip applyUntilFixed) input rules
applyRules :: RuleSet -> Text -> Text applyRules :: RuleSet -> Text -> Text
applyRules rules input = foldl (flip applyUntilFixed) input rules applyRules rules input = foldl (flip applyOnce) input rules
where
applyOnce rule = streamEdit (try rule) id
------------------------------------------------------------------------------- -------------------------------------------------------------------------------
-- rules for pangu -- rules for pangu
-- alphaNumChar from megaparsec matches CJK chars...
-- need to implement a new one
alphanumericChar :: Parser Char
alphanumericChar = satisfy $ \c ->
(c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| (c >= '0' && c <= '9')
-- | Check if a character falls within the CJK ranges provided -- | Check if a character falls within the CJK ranges provided
isCJK :: Char -> Bool isCJK :: Char -> Bool
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
@@ -97,7 +110,7 @@ fixCJKcolAN :: Rule
fixCJKcolAN = do fixCJKcolAN = do
cjk <- cjkChar cjk <- cjkChar
_ <- char ':' _ <- char ':'
an <- alphaNumChar an <- alphanumericChar
return $ T.pack $ [cjk] ++ "" ++ [an] return $ T.pack $ [cjk] ++ "" ++ [an]
-- quotes -- quotes
@@ -143,24 +156,76 @@ singlequoteCJK = do
fixPossessivequote :: Rule fixPossessivequote :: Rule
fixPossessivequote = do fixPossessivequote = do
pre <- cjkChar <|> alphaNumChar pre <- cjkChar <|> alphanumericChar
_ <- some spaceChar _ <- some spaceChar
_ <- chunk "'s" _ <- chunk "'s"
return $ T.pack $ pre : "'s" return $ T.pack $ pre : "'s"
-- hash -- hash
-- hashANSCJKhash :: Rule hashANSCJKhash :: Rule
-- hashANSCJKhash = do hashANSCJKhash = do
-- cjk1 <- cjkChar cjk1 <- cjkChar
-- _ <- char '#' _ <- char '#'
mid <- some cjkChar
_ <- char '#'
cjk2 <- cjkChar
return $ T.pack $ [cjk1] ++ " #" ++ mid ++ "# " ++ [cjk2]
cjkhash :: Rule
cjkhash = do
cjk <- cjkChar
_ <- char '#'
_ <- lookAhead $ anySingleBut ' '
return $ T.pack $ cjk : " #"
hashcjk :: Rule
hashcjk = do
_ <- char '#'
_ <- lookAhead $ anySingleBut ' '
cjk <- cjkChar
return $ T.pack $ "# " ++ [cjk]
-- operators
cjkOPTan :: Rule
cjkOPTan = do
cjk <- cjkChar
opt <- oneOf ("+-=*/&|<>%" :: [Char])
an <- alphanumericChar
return $ T.pack [cjk, ' ', opt, ' ', an]
anOPTcjk :: Rule
anOPTcjk = do
an <- alphanumericChar
opt <- oneOf ("+-=*/&|<>%" :: [Char])
cjk <- cjkChar
return $ T.pack [an, ' ', opt, ' ', cjk]
-- slash/bracket rules are not implemented
-- CJK and alphanumeric without space
cjkans :: Rule
cjkans = do
cjk <- cjkChar
_ <- lookAhead (alphanumericChar <|> oneOf ("@$%^&*-+\\=|/" :: [Char]))
return $ T.pack [cjk, ' ']
anscjk :: Rule
anscjk = do
an <- alphanumericChar <|> oneOf ("~!$%^&*-+\\=|;:,./?" :: [Char])
_ <- lookAhead cjkChar
return $ T.pack [an, ' ']
-- rule set, the order matters -- rule set, the order matters
myRules :: RuleSet recursiveRules :: RuleSet
myRules = recursiveRules =
[ fullwidthCJKsymCJK, [ fullwidthCJKsymCJK,
fullwidthCJKsym, fullwidthCJKsym
dotsCJK, ]
onepassRules :: RuleSet
onepassRules =
[ dotsCJK,
fixCJKcolAN, fixCJKcolAN,
cjkquote, cjkquote,
quoteCJK, quoteCJK,
@@ -168,5 +233,13 @@ myRules =
cjkpossessivequote, cjkpossessivequote,
-- singlequoteCJK, -- singlequoteCJK,
fixPossessivequote, fixPossessivequote,
hashANSCJKhash,
cjkhash,
-- hashcjk,
anscjk,
cjkans,
empty -- a dummy rule empty -- a dummy rule
] ]
pangu :: Text -> Text
pangu input = applyRules onepassRules $ applyRulesRecursively recursiveRules input

View File

@@ -8,15 +8,21 @@ main :: IO ()
main = hspec $ do main = hspec $ do
describe "MyLib.cjksym(cjk)" $ do describe "MyLib.cjksym(cjk)" $ do
it "converts symbols to fullwidth" $ do it "converts symbols to fullwidth" $ do
applyRules myRules "你 : 好" `shouldBe` "你:好" pangu "你 : 好" `shouldBe` "你:好"
applyRules myRules "你.好" `shouldBe` "你。好" pangu "你.好" `shouldBe` "你。好"
applyRules myRules "你:好:他" `shouldBe` "你:好:他" pangu "你:好:他" `shouldBe` "你:好:他"
applyRules myRules "你 ? 好" `shouldBe` "你?好" pangu "你 ? 好" `shouldBe` "你?好"
applyRules myRules "你…好" `shouldBe` "你… 好" pangu "你…好" `shouldBe` "你… 好"
applyRules myRules "你...好" `shouldBe` "你... 好" pangu "你...好" `shouldBe` "你... 好"
applyRules myRules "你:0" `shouldBe` "0" pangu "你:0" `shouldBe` "0"
it "fixes quotes" $ do it "fixes quotes" $ do
applyRules myRules "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\"" pangu "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\""
-- applyRules myRules "'你好'" `shouldBe` "' 你好'" -- strange behavior -- pangu "'你好'" `shouldBe` "' 你好'" -- strange behavior
applyRules myRules "你'hello'" `shouldBe` "你 'hello'" pangu "你'hello'" `shouldBe` "你 'hello'"
applyRules myRules "我 's " `shouldBe` "我's " pangu "我 's " `shouldBe` "我's "
it "fixes hash" $ do
pangu "你好#测试#世界" `shouldBe` "你好 #测试# 世界"
it "add spaces" $ do
pangu "你好and世界" `shouldBe` "你好 and 世界"
pangu "當你凝視著bugbug也凝視著你" `shouldBe` "當你凝視著 bugbug 也凝視著你"
pangu "與PM戰鬥的人,應當小心自己不要成為PM" `shouldBe` "與 PM 戰鬥的人,應當小心自己不要成為 PM"