From 0418e29edf4faff0d9ba91885ba99b79b9f07cf4 Mon Sep 17 00:00:00 2001 From: Yu Cong Date: Sat, 3 Jan 2026 17:03:47 +0800 Subject: [PATCH] z --- src/MyLib.hs | 99 +++++++++++++++++++++++++++++++++++++++++++++------- test/Main.hs | 28 +++++++++------ 2 files changed, 103 insertions(+), 24 deletions(-) diff --git a/src/MyLib.hs b/src/MyLib.hs index 3ab9913..57656b5 100644 --- a/src/MyLib.hs +++ b/src/MyLib.hs @@ -21,16 +21,29 @@ applyUntilFixed :: Rule -> Text -> Text applyUntilFixed rule = fix ( \loop current -> - let next = streamEdit rule id current + let next = streamEdit (try rule) id current in if next == current then next else loop next ) +applyRulesRecursively :: RuleSet -> Text -> Text +applyRulesRecursively rules input = foldl (flip applyUntilFixed) input rules + applyRules :: RuleSet -> Text -> Text -applyRules rules input = foldl (flip applyUntilFixed) input rules +applyRules rules input = foldl (flip applyOnce) input rules + where + applyOnce rule = streamEdit (try rule) id ------------------------------------------------------------------------------- -- rules for pangu +-- alphaNumChar from megaparsec matches CJK chars... +-- need to implement a new one +alphanumericChar :: Parser Char +alphanumericChar = satisfy $ \c -> + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + -- | Check if a character falls within the CJK ranges provided isCJK :: Char -> Bool isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges @@ -97,7 +110,7 @@ fixCJKcolAN :: Rule fixCJKcolAN = do cjk <- cjkChar _ <- char ':' - an <- alphaNumChar + an <- alphanumericChar return $ T.pack $ [cjk] ++ ":" ++ [an] -- quotes @@ -143,24 +156,76 @@ singlequoteCJK = do fixPossessivequote :: Rule fixPossessivequote = do - pre <- cjkChar <|> alphaNumChar + pre <- cjkChar <|> alphanumericChar _ <- some spaceChar _ <- chunk "'s" return $ T.pack $ pre : "'s" -- hash --- hashANSCJKhash :: Rule --- hashANSCJKhash = do --- cjk1 <- cjkChar --- _ <- char '#' +hashANSCJKhash :: Rule +hashANSCJKhash = do + cjk1 <- cjkChar + _ <- char '#' + mid <- some cjkChar + _ <- char '#' + cjk2 <- cjkChar + return $ T.pack $ [cjk1] ++ " #" ++ mid ++ "# " ++ [cjk2] +cjkhash :: Rule +cjkhash = do + cjk <- cjkChar + _ <- char '#' + _ <- lookAhead $ anySingleBut ' ' + return $ T.pack $ cjk : " #" + +hashcjk :: Rule +hashcjk = do + _ <- char '#' + _ <- lookAhead $ anySingleBut ' ' + cjk <- cjkChar + return $ T.pack $ "# " ++ [cjk] + +-- operators +cjkOPTan :: Rule +cjkOPTan = do + cjk <- cjkChar + opt <- oneOf ("+-=*/&|<>%" :: [Char]) + an <- alphanumericChar + return $ T.pack [cjk, ' ', opt, ' ', an] + +anOPTcjk :: Rule +anOPTcjk = do + an <- alphanumericChar + opt <- oneOf ("+-=*/&|<>%" :: [Char]) + cjk <- cjkChar + return $ T.pack [an, ' ', opt, ' ', cjk] + +-- slash/bracket rules are not implemented + +-- CJK and alphanumeric without space + +cjkans :: Rule +cjkans = do + cjk <- cjkChar + _ <- lookAhead (alphanumericChar <|> oneOf ("@$%^&*-+\\=|/" :: [Char])) + return $ T.pack [cjk, ' '] + +anscjk :: Rule +anscjk = do + an <- alphanumericChar <|> oneOf ("~!$%^&*-+\\=|;:,./?" :: [Char]) + _ <- lookAhead cjkChar + return $ T.pack [an, ' '] -- rule set, the order matters -myRules :: RuleSet -myRules = +recursiveRules :: RuleSet +recursiveRules = [ fullwidthCJKsymCJK, - fullwidthCJKsym, - dotsCJK, + fullwidthCJKsym + ] + +onepassRules :: RuleSet +onepassRules = + [ dotsCJK, fixCJKcolAN, cjkquote, quoteCJK, @@ -168,5 +233,13 @@ myRules = cjkpossessivequote, -- singlequoteCJK, fixPossessivequote, + hashANSCJKhash, + cjkhash, + -- hashcjk, + anscjk, + cjkans, empty -- a dummy rule - ] \ No newline at end of file + ] + +pangu :: Text -> Text +pangu input = applyRules onepassRules $ applyRulesRecursively recursiveRules input \ No newline at end of file diff --git a/test/Main.hs b/test/Main.hs index 60edf4c..2b363cb 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -8,15 +8,21 @@ main :: IO () main = hspec $ do describe "MyLib.cjksym(cjk)" $ do it "converts symbols to fullwidth" $ do - applyRules myRules "你 : 好" `shouldBe` "你:好" - applyRules myRules "你.好" `shouldBe` "你。好" - applyRules myRules "你:好:他" `shouldBe` "你:好:他" - applyRules myRules "你 ? 好" `shouldBe` "你?好" - applyRules myRules "你…好" `shouldBe` "你… 好" - applyRules myRules "你...好" `shouldBe` "你... 好" - applyRules myRules "你:0" `shouldBe` "你:0" + pangu "你 : 好" `shouldBe` "你:好" + pangu "你.好" `shouldBe` "你。好" + pangu "你:好:他" `shouldBe` "你:好:他" + pangu "你 ? 好" `shouldBe` "你?好" + pangu "你…好" `shouldBe` "你… 好" + pangu "你...好" `shouldBe` "你... 好" + pangu "你:0" `shouldBe` "你:0" it "fixes quotes" $ do - applyRules myRules "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\"" - -- applyRules myRules "'你好'" `shouldBe` "' 你好'" -- strange behavior - applyRules myRules "你'hello'" `shouldBe` "你 'hello'" - applyRules myRules "我 's " `shouldBe` "我's " \ No newline at end of file + pangu "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\"" + -- pangu "'你好'" `shouldBe` "' 你好'" -- strange behavior + pangu "你'hello'" `shouldBe` "你 'hello'" + pangu "我 's " `shouldBe` "我's " + it "fixes hash" $ do + pangu "你好#测试#世界" `shouldBe` "你好 #测试# 世界" + it "add spaces" $ do + pangu "你好and世界" `shouldBe` "你好 and 世界" + pangu "當你凝視著bug,bug也凝視著你" `shouldBe` "當你凝視著 bug,bug 也凝視著你" + pangu "與PM戰鬥的人,應當小心自己不要成為PM" `shouldBe` "與 PM 戰鬥的人,應當小心自己不要成為 PM" \ No newline at end of file