Compare commits
2 Commits
0f8ce47fa1
...
792bbc80c6
| Author | SHA1 | Date | |
|---|---|---|---|
| 792bbc80c6 | |||
| a2ea14dde2 |
104
src/MyLib.hs
104
src/MyLib.hs
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
module MyLib where
|
module MyLib where
|
||||||
|
|
||||||
|
import Data.Function (fix)
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
import Data.Void (Void)
|
import Data.Void (Void)
|
||||||
@@ -16,20 +17,16 @@ type Rule = Parser Text
|
|||||||
|
|
||||||
type RuleSet = [Rule]
|
type RuleSet = [Rule]
|
||||||
|
|
||||||
|
applyUntilFixed :: Rule -> Text -> Text
|
||||||
|
applyUntilFixed rule =
|
||||||
|
fix
|
||||||
|
( \loop current ->
|
||||||
|
let next = streamEdit rule id current
|
||||||
|
in if next == current then next else loop next
|
||||||
|
)
|
||||||
|
|
||||||
applyRules :: RuleSet -> Text -> Text
|
applyRules :: RuleSet -> Text -> Text
|
||||||
applyRules [] input = input
|
applyRules rules input = foldl (flip applyUntilFixed) input rules
|
||||||
applyRules rules input = streamEdit (choice rules) id input
|
|
||||||
|
|
||||||
-- -- TEST RULES
|
|
||||||
-- appleToOrange :: Rule
|
|
||||||
-- appleToOrange = "orange" <$ chunk "apple"
|
|
||||||
|
|
||||||
-- emailAtRule :: Rule
|
|
||||||
-- emailAtRule = do
|
|
||||||
-- prefix <- some (alphaNumChar <|> oneOf ("._%+-" :: String))
|
|
||||||
-- _ <- char '@'
|
|
||||||
-- suffix <- some (alphaNumChar <|> oneOf (".-" :: String))
|
|
||||||
-- return $ T.pack prefix <> "[at]" <> T.pack suffix
|
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
-- rules for pangu
|
-- rules for pangu
|
||||||
@@ -61,24 +58,81 @@ convertToFullwidth c =
|
|||||||
'?' -> '?'
|
'?' -> '?'
|
||||||
',' -> ','
|
',' -> ','
|
||||||
';' -> ';'
|
';' -> ';'
|
||||||
|
'\"' -> '”'
|
||||||
|
'\'' -> '’'
|
||||||
_ -> c
|
_ -> c
|
||||||
|
|
||||||
-- A parser that matches a single CJK character
|
-- A parser that matches a single CJK character
|
||||||
cjkChar :: Parser Char
|
cjkChar :: Parser Char
|
||||||
cjkChar = satisfy isCJK
|
cjkChar = satisfy isCJK
|
||||||
|
|
||||||
cjksymcjk :: Rule
|
-- use python.py as reference for these rules
|
||||||
cjksymcjk = do
|
|
||||||
c1 <- cjkChar
|
fullwidthCJKsymCJK :: Rule
|
||||||
mid <- do
|
fullwidthCJKsymCJK = do
|
||||||
_ <- many (char ' ') -- leading spaces
|
lcjk <- cjkChar
|
||||||
core <- some $ oneOf (":.~!?,;" :: [Char])
|
_ <- many (char ' ')
|
||||||
_ <- many (char ' ') -- trailing spaces
|
sym <- try (some (char ':')) <|> count 1 (char '.')
|
||||||
return $ T.pack core
|
_ <- many (char ' ')
|
||||||
c2 <- cjkChar
|
rcjk <- cjkChar
|
||||||
let transformedMid = T.pack $ map convertToFullwidth (T.unpack mid)
|
let transformedsym = map convertToFullwidth sym
|
||||||
return $ T.singleton c1 <> transformedMid <> T.singleton c2
|
return $ T.pack $ [lcjk] ++ transformedsym ++ [rcjk]
|
||||||
|
|
||||||
|
fullwidthCJKsym :: Rule
|
||||||
|
fullwidthCJKsym = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
_ <- many (char ' ')
|
||||||
|
sym <- some $ oneOf ("~!?,;" :: [Char])
|
||||||
|
_ <- many (char ' ')
|
||||||
|
let transformedsym = T.pack $ map convertToFullwidth sym
|
||||||
|
return $ T.pack [cjk] <> transformedsym
|
||||||
|
|
||||||
|
dotsCJK :: Rule
|
||||||
|
dotsCJK = do
|
||||||
|
dots <- chunk "..." <|> chunk "…"
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ dots <> T.pack (" " ++ [cjk])
|
||||||
|
|
||||||
|
fixCJKcolAN :: Rule
|
||||||
|
fixCJKcolAN = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
_ <- chunk ":"
|
||||||
|
an <- alphaNumChar
|
||||||
|
return $ T.pack $ [cjk] ++ ":" ++ [an]
|
||||||
|
|
||||||
|
-- quotes
|
||||||
|
-- seems confusing ...
|
||||||
|
quotesym :: [Char]
|
||||||
|
quotesym = "\x05f4\"\'`"
|
||||||
|
|
||||||
|
cjkquote :: Rule
|
||||||
|
cjkquote = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
quote <- oneOf quotesym
|
||||||
|
return $ T.pack $ [cjk] ++ " " ++ [quote]
|
||||||
|
|
||||||
|
quoteCJK :: Rule
|
||||||
|
quoteCJK = do
|
||||||
|
quote <- oneOf quotesym
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ T.pack $ [quote] ++ " " ++ [cjk]
|
||||||
|
|
||||||
|
fixQuote :: Rule
|
||||||
|
fixQuote = do
|
||||||
|
openQuotes <- T.pack <$> some (oneOf quotesym)
|
||||||
|
_ <- many spaceChar
|
||||||
|
content <- T.pack <$> someTill anySingle (lookAhead $ some (oneOf quotesym))
|
||||||
|
closeQuotes <- T.pack <$> some (oneOf quotesym)
|
||||||
|
return $ openQuotes <> T.strip content <> closeQuotes
|
||||||
|
|
||||||
-- the rule set
|
-- the rule set
|
||||||
myRules :: RuleSet
|
myRules :: RuleSet
|
||||||
myRules = [cjksymcjk]
|
myRules =
|
||||||
|
[ fullwidthCJKsymCJK,
|
||||||
|
fullwidthCJKsym,
|
||||||
|
dotsCJK,
|
||||||
|
fixCJKcolAN,
|
||||||
|
cjkquote,
|
||||||
|
quoteCJK,
|
||||||
|
fixQuote
|
||||||
|
]
|
||||||
12
test/Main.hs
12
test/Main.hs
@@ -6,8 +6,16 @@ import Test.Hspec
|
|||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = hspec $ do
|
main = hspec $ do
|
||||||
describe "MyLib.cjksymcjk" $ do
|
describe "MyLib.cjksym(cjk)" $ do
|
||||||
it "converts symbols between CJK characters to fullwidth" $ do
|
it "converts symbols to fullwidth" $ do
|
||||||
applyRules myRules "你 : 好" `shouldBe` "你:好"
|
applyRules myRules "你 : 好" `shouldBe` "你:好"
|
||||||
applyRules myRules "你.好" `shouldBe` "你。好"
|
applyRules myRules "你.好" `shouldBe` "你。好"
|
||||||
|
applyRules myRules "你:好:他" `shouldBe` "你:好:他"
|
||||||
applyRules myRules "你 ? 好" `shouldBe` "你?好"
|
applyRules myRules "你 ? 好" `shouldBe` "你?好"
|
||||||
|
applyRules myRules "你…好" `shouldBe` "你… 好"
|
||||||
|
applyRules myRules "你...好" `shouldBe` "你... 好"
|
||||||
|
applyRules myRules "你:0" `shouldBe` "你:0"
|
||||||
|
applyRules myRules "我说:\" 他说:\'你好\'\"" `shouldBe` "我说:\"他说:\'你好\'\""
|
||||||
|
it "adds spaces" $ do
|
||||||
|
applyRules myRules "\'你好\'" `shouldBe` "\'你好\'"
|
||||||
|
applyRules myRules "你\'hello\'" `shouldBe` "你 \'hello\'"
|
||||||
Reference in New Issue
Block a user