Compare commits
2 Commits
792bbc80c6
...
0418e29edf
| Author | SHA1 | Date | |
|---|---|---|---|
| 0418e29edf | |||
| 5ba00b7fc9 |
129
src/MyLib.hs
129
src/MyLib.hs
@@ -21,16 +21,29 @@ applyUntilFixed :: Rule -> Text -> Text
|
||||
applyUntilFixed rule =
|
||||
fix
|
||||
( \loop current ->
|
||||
let next = streamEdit rule id current
|
||||
let next = streamEdit (try rule) id current
|
||||
in if next == current then next else loop next
|
||||
)
|
||||
|
||||
applyRulesRecursively :: RuleSet -> Text -> Text
|
||||
applyRulesRecursively rules input = foldl (flip applyUntilFixed) input rules
|
||||
|
||||
applyRules :: RuleSet -> Text -> Text
|
||||
applyRules rules input = foldl (flip applyUntilFixed) input rules
|
||||
applyRules rules input = foldl (flip applyOnce) input rules
|
||||
where
|
||||
applyOnce rule = streamEdit (try rule) id
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
-- rules for pangu
|
||||
|
||||
-- alphaNumChar from megaparsec matches CJK chars...
|
||||
-- need to implement a new one
|
||||
alphanumericChar :: Parser Char
|
||||
alphanumericChar = satisfy $ \c ->
|
||||
(c >= 'a' && c <= 'z')
|
||||
|| (c >= 'A' && c <= 'Z')
|
||||
|| (c >= '0' && c <= '9')
|
||||
|
||||
-- | Check if a character falls within the CJK ranges provided
|
||||
isCJK :: Char -> Bool
|
||||
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
|
||||
@@ -96,14 +109,14 @@ dotsCJK = do
|
||||
fixCJKcolAN :: Rule
|
||||
fixCJKcolAN = do
|
||||
cjk <- cjkChar
|
||||
_ <- chunk ":"
|
||||
an <- alphaNumChar
|
||||
_ <- char ':'
|
||||
an <- alphanumericChar
|
||||
return $ T.pack $ [cjk] ++ ":" ++ [an]
|
||||
|
||||
-- quotes
|
||||
-- seems confusing ...
|
||||
quotesym :: [Char]
|
||||
quotesym = "\x05f4\"\'`"
|
||||
quotesym = "'`\x05f4\""
|
||||
|
||||
cjkquote :: Rule
|
||||
cjkquote = do
|
||||
@@ -125,14 +138,108 @@ fixQuote = do
|
||||
closeQuotes <- T.pack <$> some (oneOf quotesym)
|
||||
return $ openQuotes <> T.strip content <> closeQuotes
|
||||
|
||||
-- the rule set
|
||||
myRules :: RuleSet
|
||||
myRules =
|
||||
cjkpossessivequote :: Rule
|
||||
cjkpossessivequote = do
|
||||
cjk <- cjkChar
|
||||
_ <- char '\''
|
||||
_ <- lookAhead $ anySingleBut 's'
|
||||
return $ T.pack $ cjk : " '"
|
||||
|
||||
-- This singlequoteCJK rule will turn '你好' into ' 你好'
|
||||
-- which seems not desirable...
|
||||
-- however, the behavior is aligned with python version
|
||||
singlequoteCJK :: Rule
|
||||
singlequoteCJK = do
|
||||
_ <- char '\''
|
||||
cjk <- cjkChar
|
||||
return $ T.pack $ "' " ++ [cjk]
|
||||
|
||||
fixPossessivequote :: Rule
|
||||
fixPossessivequote = do
|
||||
pre <- cjkChar <|> alphanumericChar
|
||||
_ <- some spaceChar
|
||||
_ <- chunk "'s"
|
||||
return $ T.pack $ pre : "'s"
|
||||
|
||||
-- hash
|
||||
hashANSCJKhash :: Rule
|
||||
hashANSCJKhash = do
|
||||
cjk1 <- cjkChar
|
||||
_ <- char '#'
|
||||
mid <- some cjkChar
|
||||
_ <- char '#'
|
||||
cjk2 <- cjkChar
|
||||
return $ T.pack $ [cjk1] ++ " #" ++ mid ++ "# " ++ [cjk2]
|
||||
|
||||
cjkhash :: Rule
|
||||
cjkhash = do
|
||||
cjk <- cjkChar
|
||||
_ <- char '#'
|
||||
_ <- lookAhead $ anySingleBut ' '
|
||||
return $ T.pack $ cjk : " #"
|
||||
|
||||
hashcjk :: Rule
|
||||
hashcjk = do
|
||||
_ <- char '#'
|
||||
_ <- lookAhead $ anySingleBut ' '
|
||||
cjk <- cjkChar
|
||||
return $ T.pack $ "# " ++ [cjk]
|
||||
|
||||
-- operators
|
||||
cjkOPTan :: Rule
|
||||
cjkOPTan = do
|
||||
cjk <- cjkChar
|
||||
opt <- oneOf ("+-=*/&|<>%" :: [Char])
|
||||
an <- alphanumericChar
|
||||
return $ T.pack [cjk, ' ', opt, ' ', an]
|
||||
|
||||
anOPTcjk :: Rule
|
||||
anOPTcjk = do
|
||||
an <- alphanumericChar
|
||||
opt <- oneOf ("+-=*/&|<>%" :: [Char])
|
||||
cjk <- cjkChar
|
||||
return $ T.pack [an, ' ', opt, ' ', cjk]
|
||||
|
||||
-- slash/bracket rules are not implemented
|
||||
|
||||
-- CJK and alphanumeric without space
|
||||
|
||||
cjkans :: Rule
|
||||
cjkans = do
|
||||
cjk <- cjkChar
|
||||
_ <- lookAhead (alphanumericChar <|> oneOf ("@$%^&*-+\\=|/" :: [Char]))
|
||||
return $ T.pack [cjk, ' ']
|
||||
|
||||
anscjk :: Rule
|
||||
anscjk = do
|
||||
an <- alphanumericChar <|> oneOf ("~!$%^&*-+\\=|;:,./?" :: [Char])
|
||||
_ <- lookAhead cjkChar
|
||||
return $ T.pack [an, ' ']
|
||||
|
||||
-- rule set, the order matters
|
||||
recursiveRules :: RuleSet
|
||||
recursiveRules =
|
||||
[ fullwidthCJKsymCJK,
|
||||
fullwidthCJKsym,
|
||||
dotsCJK,
|
||||
fullwidthCJKsym
|
||||
]
|
||||
|
||||
onepassRules :: RuleSet
|
||||
onepassRules =
|
||||
[ dotsCJK,
|
||||
fixCJKcolAN,
|
||||
cjkquote,
|
||||
quoteCJK,
|
||||
fixQuote
|
||||
fixQuote,
|
||||
cjkpossessivequote,
|
||||
-- singlequoteCJK,
|
||||
fixPossessivequote,
|
||||
hashANSCJKhash,
|
||||
cjkhash,
|
||||
-- hashcjk,
|
||||
anscjk,
|
||||
cjkans,
|
||||
empty -- a dummy rule
|
||||
]
|
||||
|
||||
pangu :: Text -> Text
|
||||
pangu input = applyRules onepassRules $ applyRulesRecursively recursiveRules input
|
||||
29
test/Main.hs
29
test/Main.hs
@@ -8,14 +8,21 @@ main :: IO ()
|
||||
main = hspec $ do
|
||||
describe "MyLib.cjksym(cjk)" $ do
|
||||
it "converts symbols to fullwidth" $ do
|
||||
applyRules myRules "你 : 好" `shouldBe` "你:好"
|
||||
applyRules myRules "你.好" `shouldBe` "你。好"
|
||||
applyRules myRules "你:好:他" `shouldBe` "你:好:他"
|
||||
applyRules myRules "你 ? 好" `shouldBe` "你?好"
|
||||
applyRules myRules "你…好" `shouldBe` "你… 好"
|
||||
applyRules myRules "你...好" `shouldBe` "你... 好"
|
||||
applyRules myRules "你:0" `shouldBe` "你:0"
|
||||
applyRules myRules "我说:\" 他说:\'你好\'\"" `shouldBe` "我说:\"他说:\'你好\'\""
|
||||
it "adds spaces" $ do
|
||||
applyRules myRules "\'你好\'" `shouldBe` "\'你好\'"
|
||||
applyRules myRules "你\'hello\'" `shouldBe` "你 \'hello\'"
|
||||
pangu "你 : 好" `shouldBe` "你:好"
|
||||
pangu "你.好" `shouldBe` "你。好"
|
||||
pangu "你:好:他" `shouldBe` "你:好:他"
|
||||
pangu "你 ? 好" `shouldBe` "你?好"
|
||||
pangu "你…好" `shouldBe` "你… 好"
|
||||
pangu "你...好" `shouldBe` "你... 好"
|
||||
pangu "你:0" `shouldBe` "你:0"
|
||||
it "fixes quotes" $ do
|
||||
pangu "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\""
|
||||
-- pangu "'你好'" `shouldBe` "' 你好'" -- strange behavior
|
||||
pangu "你'hello'" `shouldBe` "你 'hello'"
|
||||
pangu "我 's " `shouldBe` "我's "
|
||||
it "fixes hash" $ do
|
||||
pangu "你好#测试#世界" `shouldBe` "你好 #测试# 世界"
|
||||
it "add spaces" $ do
|
||||
pangu "你好and世界" `shouldBe` "你好 and 世界"
|
||||
pangu "當你凝視著bug,bug也凝視著你" `shouldBe` "當你凝視著 bug,bug 也凝視著你"
|
||||
pangu "與PM戰鬥的人,應當小心自己不要成為PM" `shouldBe` "與 PM 戰鬥的人,應當小心自己不要成為 PM"
|
||||
Reference in New Issue
Block a user