From 5ba00b7fc92eedd4a5812e2d3af0920e9aab553b Mon Sep 17 00:00:00 2001
From: Yu Cong
Date: Sat, 3 Jan 2026 15:14:31 +0800
Subject: [PATCH] Refactor quote handling and add new rules for possessive quotes

---
 src/MyLib.hs | 46 ++++++++++++++++++++++++++++++++++++++++------
 test/Main.hs | 9 +++++----
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/src/MyLib.hs b/src/MyLib.hs
index a868351..3ab9913 100644
--- a/src/MyLib.hs
+++ b/src/MyLib.hs
@@ -75,7 +75,7 @@ fullwidthCJKsymCJK = do
   sym <- try (some (char ':')) <|> count 1 (char '.')
   _ <- many (char ' ')
   rcjk <- cjkChar
-  let transformedsym = map convertToFullwidth sym
+  let transformedsym = map convertToFullwidth sym
   return $ T.pack $ [lcjk] ++ transformedsym ++ [rcjk]

 fullwidthCJKsym :: Rule
@@ -96,14 +96,14 @@ dotsCJK = do
 fixCJKcolAN :: Rule
 fixCJKcolAN = do
   cjk <- cjkChar
-  _ <- chunk ":"
+  _ <- char ':'
   an <- alphaNumChar
   return $ T.pack $ [cjk] ++ ":" ++ [an]

 -- quotes
 -- seems confusing ...
 quotesym :: [Char]
-quotesym = "\x05f4\"\'`"
+quotesym = "'`\x05f4\""

 cjkquote :: Rule
 cjkquote = do
@@ -122,10 +122,40 @@ fixQuote = do
   openQuotes <- T.pack <$> some (oneOf quotesym)
   _ <- many spaceChar
   content <- T.pack <$> someTill anySingle (lookAhead $ some (oneOf quotesym))
-  closeQuotes <- T.pack <$> some (oneOf quotesym)
+  closeQuotes <- T.pack <$> some (oneOf quotesym)
   return $ openQuotes <> T.strip content <> closeQuotes

--- the rule set
+cjkpossessivequote :: Rule
+cjkpossessivequote = do
+  cjk <- cjkChar
+  _ <- char '\''
+  _ <- lookAhead $ anySingleBut 's'
+  return $ T.pack $ cjk : " '"
+
+-- This singlequoteCJK rule will turn '你好' into ' 你好'
+-- which seems not desirable...
+-- however, the behavior is aligned with python version
+singlequoteCJK :: Rule
+singlequoteCJK = do
+  _ <- char '\''
+  cjk <- cjkChar
+  return $ T.pack $ "' " ++ [cjk]
+
+fixPossessivequote :: Rule
+fixPossessivequote = do
+  pre <- cjkChar <|> alphaNumChar
+  _ <- some spaceChar
+  _ <- chunk "'s"
+  return $ T.pack $ pre : "'s"
+
+-- hash
+-- hashANSCJKhash :: Rule
+-- hashANSCJKhash = do
+--   cjk1 <- cjkChar
+--   _ <- char '#'
+
+
+-- rule set, the order matters
 myRules :: RuleSet
 myRules =
   [ fullwidthCJKsymCJK,
@@ -134,5 +164,9 @@ myRules =
     fixCJKcolAN,
     cjkquote,
     quoteCJK,
-    fixQuote
+    fixQuote,
+    cjkpossessivequote,
+    -- singlequoteCJK,
+    fixPossessivequote,
+    empty -- a dummy rule
   ]
\ No newline at end of file
diff --git a/test/Main.hs b/test/Main.hs
index 9db4b1e..60edf4c 100644
--- a/test/Main.hs
+++ b/test/Main.hs
@@ -15,7 +15,8 @@ main = hspec $ do
       applyRules myRules "你…好" `shouldBe` "你… 好"
       applyRules myRules "你...好" `shouldBe` "你... 好"
       applyRules myRules "你:0" `shouldBe` "你:0"
-      applyRules myRules "我说:\" 他说:\'你好\'\"" `shouldBe` "我说:\"他说:\'你好\'\""
-    it "adds spaces" $ do
-      applyRules myRules "\'你好\'" `shouldBe` "\'你好\'"
-      applyRules myRules "你\'hello\'" `shouldBe` "你 \'hello\'"
\ No newline at end of file
+    it "fixes quotes" $ do
+      applyRules myRules "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\""
+      -- applyRules myRules "'你好'" `shouldBe` "' 你好'" -- strange behavior
+      applyRules myRules "你'hello'" `shouldBe` "你 'hello'"
+      applyRules myRules "我 's " `shouldBe` "我's "
\ No newline at end of file