Compare commits
2 Commits
792bbc80c6
...
0418e29edf
| Author | SHA1 | Date | |
|---|---|---|---|
| 0418e29edf | |||
| 5ba00b7fc9 |
129
src/MyLib.hs
129
src/MyLib.hs
@@ -21,16 +21,29 @@ applyUntilFixed :: Rule -> Text -> Text
|
|||||||
applyUntilFixed rule =
|
applyUntilFixed rule =
|
||||||
fix
|
fix
|
||||||
( \loop current ->
|
( \loop current ->
|
||||||
let next = streamEdit rule id current
|
let next = streamEdit (try rule) id current
|
||||||
in if next == current then next else loop next
|
in if next == current then next else loop next
|
||||||
)
|
)
|
||||||
|
|
||||||
|
applyRulesRecursively :: RuleSet -> Text -> Text
|
||||||
|
applyRulesRecursively rules input = foldl (flip applyUntilFixed) input rules
|
||||||
|
|
||||||
applyRules :: RuleSet -> Text -> Text
|
applyRules :: RuleSet -> Text -> Text
|
||||||
applyRules rules input = foldl (flip applyUntilFixed) input rules
|
applyRules rules input = foldl (flip applyOnce) input rules
|
||||||
|
where
|
||||||
|
applyOnce rule = streamEdit (try rule) id
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
-- rules for pangu
|
-- rules for pangu
|
||||||
|
|
||||||
|
-- alphaNumChar from megaparsec matches CJK chars...
|
||||||
|
-- need to implement a new one
|
||||||
|
alphanumericChar :: Parser Char
|
||||||
|
alphanumericChar = satisfy $ \c ->
|
||||||
|
(c >= 'a' && c <= 'z')
|
||||||
|
|| (c >= 'A' && c <= 'Z')
|
||||||
|
|| (c >= '0' && c <= '9')
|
||||||
|
|
||||||
-- | Check if a character falls within the CJK ranges provided
|
-- | Check if a character falls within the CJK ranges provided
|
||||||
isCJK :: Char -> Bool
|
isCJK :: Char -> Bool
|
||||||
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
|
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
|
||||||
@@ -96,14 +109,14 @@ dotsCJK = do
|
|||||||
fixCJKcolAN :: Rule
|
fixCJKcolAN :: Rule
|
||||||
fixCJKcolAN = do
|
fixCJKcolAN = do
|
||||||
cjk <- cjkChar
|
cjk <- cjkChar
|
||||||
_ <- chunk ":"
|
_ <- char ':'
|
||||||
an <- alphaNumChar
|
an <- alphanumericChar
|
||||||
return $ T.pack $ [cjk] ++ ":" ++ [an]
|
return $ T.pack $ [cjk] ++ ":" ++ [an]
|
||||||
|
|
||||||
-- quotes
|
-- quotes
|
||||||
-- seems confusing ...
|
-- seems confusing ...
|
||||||
quotesym :: [Char]
|
quotesym :: [Char]
|
||||||
quotesym = "\x05f4\"\'`"
|
quotesym = "'`\x05f4\""
|
||||||
|
|
||||||
cjkquote :: Rule
|
cjkquote :: Rule
|
||||||
cjkquote = do
|
cjkquote = do
|
||||||
@@ -125,14 +138,108 @@ fixQuote = do
|
|||||||
closeQuotes <- T.pack <$> some (oneOf quotesym)
|
closeQuotes <- T.pack <$> some (oneOf quotesym)
|
||||||
return $ openQuotes <> T.strip content <> closeQuotes
|
return $ openQuotes <> T.strip content <> closeQuotes
|
||||||
|
|
||||||
-- the rule set
|
cjkpossessivequote :: Rule
|
||||||
myRules :: RuleSet
|
cjkpossessivequote = do
|
||||||
myRules =
|
cjk <- cjkChar
|
||||||
|
_ <- char '\''
|
||||||
|
_ <- lookAhead $ anySingleBut 's'
|
||||||
|
return $ T.pack $ cjk : " '"
|
||||||
|
|
||||||
|
-- This singlequoteCJK rule will turn '你好' into ' 你好'
|
||||||
|
-- which seems not desirable...
|
||||||
|
-- however, the behavior is aligned with python version
|
||||||
|
singlequoteCJK :: Rule
|
||||||
|
singlequoteCJK = do
|
||||||
|
_ <- char '\''
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ T.pack $ "' " ++ [cjk]
|
||||||
|
|
||||||
|
fixPossessivequote :: Rule
|
||||||
|
fixPossessivequote = do
|
||||||
|
pre <- cjkChar <|> alphanumericChar
|
||||||
|
_ <- some spaceChar
|
||||||
|
_ <- chunk "'s"
|
||||||
|
return $ T.pack $ pre : "'s"
|
||||||
|
|
||||||
|
-- hash
|
||||||
|
hashANSCJKhash :: Rule
|
||||||
|
hashANSCJKhash = do
|
||||||
|
cjk1 <- cjkChar
|
||||||
|
_ <- char '#'
|
||||||
|
mid <- some cjkChar
|
||||||
|
_ <- char '#'
|
||||||
|
cjk2 <- cjkChar
|
||||||
|
return $ T.pack $ [cjk1] ++ " #" ++ mid ++ "# " ++ [cjk2]
|
||||||
|
|
||||||
|
cjkhash :: Rule
|
||||||
|
cjkhash = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
_ <- char '#'
|
||||||
|
_ <- lookAhead $ anySingleBut ' '
|
||||||
|
return $ T.pack $ cjk : " #"
|
||||||
|
|
||||||
|
hashcjk :: Rule
|
||||||
|
hashcjk = do
|
||||||
|
_ <- char '#'
|
||||||
|
_ <- lookAhead $ anySingleBut ' '
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ T.pack $ "# " ++ [cjk]
|
||||||
|
|
||||||
|
-- operators
|
||||||
|
cjkOPTan :: Rule
|
||||||
|
cjkOPTan = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
opt <- oneOf ("+-=*/&|<>%" :: [Char])
|
||||||
|
an <- alphanumericChar
|
||||||
|
return $ T.pack [cjk, ' ', opt, ' ', an]
|
||||||
|
|
||||||
|
anOPTcjk :: Rule
|
||||||
|
anOPTcjk = do
|
||||||
|
an <- alphanumericChar
|
||||||
|
opt <- oneOf ("+-=*/&|<>%" :: [Char])
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ T.pack [an, ' ', opt, ' ', cjk]
|
||||||
|
|
||||||
|
-- slash/bracket rules are not implemented
|
||||||
|
|
||||||
|
-- CJK and alphanumeric without space
|
||||||
|
|
||||||
|
cjkans :: Rule
|
||||||
|
cjkans = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
_ <- lookAhead (alphanumericChar <|> oneOf ("@$%^&*-+\\=|/" :: [Char]))
|
||||||
|
return $ T.pack [cjk, ' ']
|
||||||
|
|
||||||
|
anscjk :: Rule
|
||||||
|
anscjk = do
|
||||||
|
an <- alphanumericChar <|> oneOf ("~!$%^&*-+\\=|;:,./?" :: [Char])
|
||||||
|
_ <- lookAhead cjkChar
|
||||||
|
return $ T.pack [an, ' ']
|
||||||
|
|
||||||
|
-- rule set, the order matters
|
||||||
|
recursiveRules :: RuleSet
|
||||||
|
recursiveRules =
|
||||||
[ fullwidthCJKsymCJK,
|
[ fullwidthCJKsymCJK,
|
||||||
fullwidthCJKsym,
|
fullwidthCJKsym
|
||||||
dotsCJK,
|
]
|
||||||
|
|
||||||
|
onepassRules :: RuleSet
|
||||||
|
onepassRules =
|
||||||
|
[ dotsCJK,
|
||||||
fixCJKcolAN,
|
fixCJKcolAN,
|
||||||
cjkquote,
|
cjkquote,
|
||||||
quoteCJK,
|
quoteCJK,
|
||||||
fixQuote
|
fixQuote,
|
||||||
|
cjkpossessivequote,
|
||||||
|
-- singlequoteCJK,
|
||||||
|
fixPossessivequote,
|
||||||
|
hashANSCJKhash,
|
||||||
|
cjkhash,
|
||||||
|
-- hashcjk,
|
||||||
|
anscjk,
|
||||||
|
cjkans,
|
||||||
|
empty -- a dummy rule
|
||||||
]
|
]
|
||||||
|
|
||||||
|
pangu :: Text -> Text
|
||||||
|
pangu input = applyRules onepassRules $ applyRulesRecursively recursiveRules input
|
||||||
29
test/Main.hs
29
test/Main.hs
@@ -8,14 +8,21 @@ main :: IO ()
|
|||||||
main = hspec $ do
|
main = hspec $ do
|
||||||
describe "MyLib.cjksym(cjk)" $ do
|
describe "MyLib.cjksym(cjk)" $ do
|
||||||
it "converts symbols to fullwidth" $ do
|
it "converts symbols to fullwidth" $ do
|
||||||
applyRules myRules "你 : 好" `shouldBe` "你:好"
|
pangu "你 : 好" `shouldBe` "你:好"
|
||||||
applyRules myRules "你.好" `shouldBe` "你。好"
|
pangu "你.好" `shouldBe` "你。好"
|
||||||
applyRules myRules "你:好:他" `shouldBe` "你:好:他"
|
pangu "你:好:他" `shouldBe` "你:好:他"
|
||||||
applyRules myRules "你 ? 好" `shouldBe` "你?好"
|
pangu "你 ? 好" `shouldBe` "你?好"
|
||||||
applyRules myRules "你…好" `shouldBe` "你… 好"
|
pangu "你…好" `shouldBe` "你… 好"
|
||||||
applyRules myRules "你...好" `shouldBe` "你... 好"
|
pangu "你...好" `shouldBe` "你... 好"
|
||||||
applyRules myRules "你:0" `shouldBe` "你:0"
|
pangu "你:0" `shouldBe` "你:0"
|
||||||
applyRules myRules "我说:\" 他说:\'你好\'\"" `shouldBe` "我说:\"他说:\'你好\'\""
|
it "fixes quotes" $ do
|
||||||
it "adds spaces" $ do
|
pangu "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\""
|
||||||
applyRules myRules "\'你好\'" `shouldBe` "\'你好\'"
|
-- pangu "'你好'" `shouldBe` "' 你好'" -- strange behavior
|
||||||
applyRules myRules "你\'hello\'" `shouldBe` "你 \'hello\'"
|
pangu "你'hello'" `shouldBe` "你 'hello'"
|
||||||
|
pangu "我 's " `shouldBe` "我's "
|
||||||
|
it "fixes hash" $ do
|
||||||
|
pangu "你好#测试#世界" `shouldBe` "你好 #测试# 世界"
|
||||||
|
it "add spaces" $ do
|
||||||
|
pangu "你好and世界" `shouldBe` "你好 and 世界"
|
||||||
|
pangu "當你凝視著bug,bug也凝視著你" `shouldBe` "當你凝視著 bug,bug 也凝視著你"
|
||||||
|
pangu "與PM戰鬥的人,應當小心自己不要成為PM" `shouldBe` "與 PM 戰鬥的人,應當小心自己不要成為 PM"
|
||||||
Reference in New Issue
Block a user