Compare commits
4 Commits
792bbc80c6
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 984069530e | |||
| 211df1191f | |||
| 0418e29edf | |||
| 5ba00b7fc9 |
@@ -1,5 +0,0 @@
|
|||||||
# Revision history for panguFilter
|
|
||||||
|
|
||||||
## 0.1.0.0 -- YYYY-mm-dd
|
|
||||||
|
|
||||||
* First version. Released on an unsuspecting world.
|
|
||||||
11
README.md
Normal file
11
README.md
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# pangu.hs
|
||||||
|
|
||||||
|
Insert whitespace between CJK and half-width characters.
|
||||||
|
|
||||||
|
This haskell version implements a subset of spacing rules in [pangu.py](https://github.com/vinta/pangu.py).
|
||||||
|
|
||||||
|
## test
|
||||||
|
|
||||||
|
```sh
|
||||||
|
cabal test
|
||||||
|
```
|
||||||
191
pangu.py
191
pangu.py
@@ -1,191 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
"""
|
|
||||||
Paranoid text spacing for good readability, to automatically insert whitespace between CJK (Chinese, Japanese, Korean) and half-width characters (alphabetical letters, numerical digits and symbols).
|
|
||||||
|
|
||||||
>>> import pangu
|
|
||||||
>>> new_text = pangu.spacing_text('當你凝視著bug,bug也凝視著你')
>>> print(new_text)
'當你凝視著 bug,bug 也凝視著你'
>>> new_content = pangu.spacing_file('path/to/file.txt')
>>> print(new_content)
'與 PM 戰鬥的人,應當小心自己不要成為 PM'
|
|
||||||
'與 PM 戰鬥的人,應當小心自己不要成為 PM'
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
|
|
||||||
__version__ = '4.0.6.1'
|
|
||||||
__all__ = ['spacing_text', 'spacing_file', 'spacing', 'cli']
|
|
||||||
|
|
||||||
CJK = r'\u2e80-\u2eff\u2f00-\u2fdf\u3040-\u309f\u30a0-\u30fa\u30fc-\u30ff\u3100-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff'
|
|
||||||
|
|
||||||
ANY_CJK = re.compile(r'[{CJK}]'.format(CJK=CJK))
|
|
||||||
|
|
||||||
CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK = re.compile('([{CJK}])([ ]*(?:[\\:]+|\\.)[ ]*)([{CJK}])'.format(CJK=CJK)) # there is an extra non-capturing group compared to JavaScript version
|
|
||||||
CONVERT_TO_FULLWIDTH_CJK_SYMBOLS = re.compile('([{CJK}])[ ]*([~\\!;,\\?]+)[ ]*'.format(CJK=CJK))
|
|
||||||
DOTS_CJK = re.compile('([\\.]{{2,}}|\u2026)([{CJK}])'.format(CJK=CJK)) # need to escape { }
|
|
||||||
FIX_CJK_COLON_ANS = re.compile('([{CJK}])\\:([A-Z0-9\\(\\)])'.format(CJK=CJK))
|
|
||||||
|
|
||||||
CJK_QUOTE = re.compile('([{CJK}])([`"\u05f4])'.format(CJK=CJK)) # no need to escape `
|
|
||||||
QUOTE_CJK = re.compile('([`"\u05f4])([{CJK}])'.format(CJK=CJK)) # no need to escape `
|
|
||||||
FIX_QUOTE_ANY_QUOTE = re.compile(r'([`"\u05f4]+)(\s*)(.+?)(\s*)([`"\u05f4]+)')
|
|
||||||
|
|
||||||
CJK_SINGLE_QUOTE_BUT_POSSESSIVE = re.compile("([{CJK}])('[^s])".format(CJK=CJK))
|
|
||||||
SINGLE_QUOTE_CJK = re.compile("(')([{CJK}])".format(CJK=CJK))
|
|
||||||
FIX_POSSESSIVE_SINGLE_QUOTE = re.compile("([{CJK}A-Za-z0-9])( )('s)".format(CJK=CJK))
|
|
||||||
|
|
||||||
HASH_ANS_CJK_HASH = re.compile('([{CJK}])(#)([{CJK}]+)(#)([{CJK}])'.format(CJK=CJK))
|
|
||||||
CJK_HASH = re.compile('([{CJK}])(#([^ ]))'.format(CJK=CJK))
|
|
||||||
HASH_CJK = re.compile('(([^ ])#)([{CJK}])'.format(CJK=CJK))
|
|
||||||
|
|
||||||
CJK_OPERATOR_ANS = re.compile('([{CJK}])([\\+\\-\\*\\/=&\\|<>])([A-Za-z0-9])'.format(CJK=CJK))
|
|
||||||
ANS_OPERATOR_CJK = re.compile('([A-Za-z0-9])([\\+\\-\\*\\/=&\\|<>])([{CJK}])'.format(CJK=CJK))
|
|
||||||
|
|
||||||
FIX_SLASH_AS = re.compile(r'([/]) ([a-z\-_\./]+)')
|
|
||||||
FIX_SLASH_AS_SLASH = re.compile(r'([/\.])([A-Za-z\-_\./]+) ([/])')
|
|
||||||
|
|
||||||
CJK_LEFT_BRACKET = re.compile('([{CJK}])([\\(\\[\\{{<>\u201c])'.format(CJK=CJK)) # need to escape {
|
|
||||||
RIGHT_BRACKET_CJK = re.compile('([\\)\\]\\}}<>\u201d])([{CJK}])'.format(CJK=CJK)) # need to escape }
|
|
||||||
FIX_LEFT_BRACKET_ANY_RIGHT_BRACKET = re.compile(r'([\(\[\{<\u201c]+)(\s*)(.+?)(\s*)([\)\]\}>\u201d]+)') # need to escape { }
|
|
||||||
ANS_CJK_LEFT_BRACKET_ANY_RIGHT_BRACKET = re.compile('([A-Za-z0-9{CJK}])[ ]*([\u201c])([A-Za-z0-9{CJK}\\-_ ]+)([\u201d])'.format(CJK=CJK))
|
|
||||||
LEFT_BRACKET_ANY_RIGHT_BRACKET_ANS_CJK = re.compile('([\u201c])([A-Za-z0-9{CJK}\\-_ ]+)([\u201d])[ ]*([A-Za-z0-9{CJK}])'.format(CJK=CJK))
|
|
||||||
|
|
||||||
AN_LEFT_BRACKET = re.compile(r'([A-Za-z0-9])([\(\[\{])')
|
|
||||||
RIGHT_BRACKET_AN = re.compile(r'([\)\]\}])([A-Za-z0-9])')
|
|
||||||
|
|
||||||
CJK_ANS = re.compile('([{CJK}])([A-Za-z\u0370-\u03ff0-9@\\$%\\^&\\*\\-\\+\\\\=\\|/\u00a1-\u00ff\u2150-\u218f\u2700—\u27bf])'.format(CJK=CJK))
|
|
||||||
ANS_CJK = re.compile('([A-Za-z\u0370-\u03ff0-9~\\!\\$%\\^&\\*\\-\\+\\\\=\\|;:,\\./\\?\u00a1-\u00ff\u2150-\u218f\u2700—\u27bf])([{CJK}])'.format(CJK=CJK))
|
|
||||||
|
|
||||||
S_A = re.compile(r'(%)([A-Za-z])')
|
|
||||||
|
|
||||||
MIDDLE_DOT = re.compile(r'([ ]*)([\u00b7\u2022\u2027])([ ]*)')
|
|
||||||
|
|
||||||
# Python version only
|
|
||||||
TILDES = re.compile(r'~+')
|
|
||||||
EXCLAMATION_MARKS = re.compile(r'!+')
|
|
||||||
SEMICOLONS = re.compile(r';+')
|
|
||||||
COLONS = re.compile(r':+')
|
|
||||||
COMMAS = re.compile(r',+')
|
|
||||||
PERIODS = re.compile(r'\.+')
|
|
||||||
QUESTION_MARKS = re.compile(r'\?+')
|
|
||||||
|
|
||||||
|
|
||||||
def convert_to_fullwidth(symbols):
    """
    Collapse each run of half-width punctuation in `symbols` to a single
    full-width character and strip surrounding whitespace.
    """
    # Each pattern matches a run of one repeated half-width symbol; the
    # replacement is its full-width counterpart.
    conversions = (
        (TILDES, '~'),
        (EXCLAMATION_MARKS, '!'),
        (SEMICOLONS, ';'),
        (COLONS, ':'),
        (COMMAS, ','),
        (PERIODS, '。'),
        (QUESTION_MARKS, '?'),
    )
    for pattern, fullwidth in conversions:
        symbols = pattern.sub(fullwidth, symbols)
    return symbols.strip()
|
|
||||||
|
|
||||||
|
|
||||||
def spacing(text):
    """
    Perform paranoid text spacing on text.
    """
    # Nothing to do for trivial input or input without any CJK character.
    if len(text) <= 1 or not ANY_CJK.search(text):
        return text

    new_text = text

    # TODO: refactoring
    # Repeatedly convert half-width ':'/'.' runs squeezed between two CJK
    # characters to full-width; re-search after each rewrite because a
    # replacement can create a new match.
    matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK.search(new_text)
    while matched:
        start, end = matched.span()
        new_text = ''.join((new_text[:start + 1], convert_to_fullwidth(new_text[start + 1:end - 1]), new_text[end - 1:]))
        matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK.search(new_text)

    # Same loop for '~ ! ; , ?' runs that directly follow a CJK character.
    matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS.search(new_text)
    while matched:
        start, end = matched.span()
        new_text = ''.join((new_text[:start + 1].strip(), convert_to_fullwidth(new_text[start + 1:end]), new_text[end:].strip()))
        matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS.search(new_text)

    # NOTE(review): from here on the order of substitutions matters —
    # later patterns rely on the spacing produced by earlier ones.
    new_text = DOTS_CJK.sub(r'\1 \2', new_text)
    new_text = FIX_CJK_COLON_ANS.sub(r'\1:\2', new_text)

    # Quotes.
    new_text = CJK_QUOTE.sub(r'\1 \2', new_text)
    new_text = QUOTE_CJK.sub(r'\1 \2', new_text)
    new_text = FIX_QUOTE_ANY_QUOTE.sub(r'\1\3\5', new_text)

    # Single quotes and English possessives ('s).
    new_text = CJK_SINGLE_QUOTE_BUT_POSSESSIVE.sub(r'\1 \2', new_text)
    new_text = SINGLE_QUOTE_CJK.sub(r'\1 \2', new_text)
    new_text = FIX_POSSESSIVE_SINGLE_QUOTE.sub(r"\1's", new_text)

    # Hashtags (e.g. 微博-style #tag#).
    new_text = HASH_ANS_CJK_HASH.sub(r'\1 \2\3\4 \5', new_text)
    new_text = CJK_HASH.sub(r'\1 \2', new_text)
    new_text = HASH_CJK.sub(r'\1 \3', new_text)

    # Operators between CJK and alphanumerics.
    new_text = CJK_OPERATOR_ANS.sub(r'\1 \2 \3', new_text)
    new_text = ANS_OPERATOR_CJK.sub(r'\1 \2 \3', new_text)

    # Remove spaces inside path-like '/' sequences that the operator
    # rules above may have introduced.
    new_text = FIX_SLASH_AS.sub(r'\1\2', new_text)
    new_text = FIX_SLASH_AS_SLASH.sub(r'\1\2\3', new_text)

    # Brackets.
    new_text = CJK_LEFT_BRACKET.sub(r'\1 \2', new_text)
    new_text = RIGHT_BRACKET_CJK.sub(r'\1 \2', new_text)
    new_text = FIX_LEFT_BRACKET_ANY_RIGHT_BRACKET.sub(r'\1\3\5', new_text)
    new_text = ANS_CJK_LEFT_BRACKET_ANY_RIGHT_BRACKET.sub(r'\1 \2\3\4', new_text)
    new_text = LEFT_BRACKET_ANY_RIGHT_BRACKET_ANS_CJK.sub(r'\1\2\3 \4', new_text)

    new_text = AN_LEFT_BRACKET.sub(r'\1 \2', new_text)
    new_text = RIGHT_BRACKET_AN.sub(r'\1 \2', new_text)

    # Generic CJK <-> alphanumeric/symbol spacing.
    new_text = CJK_ANS.sub(r'\1 \2', new_text)
    new_text = ANS_CJK.sub(r'\1 \2', new_text)

    new_text = S_A.sub(r'\1 \2', new_text)

    # Normalise middle dots to '・' (U+30FB) and drop surrounding spaces.
    new_text = MIDDLE_DOT.sub('・', new_text)

    return new_text.strip()
|
|
||||||
|
|
||||||
|
|
||||||
def spacing_text(text):
    """
    Perform paranoid text spacing on text. An alias of `spacing()`.
    """
    # Thin public alias; both names are exported via __all__.
    return spacing(text)
|
|
||||||
|
|
||||||
|
|
||||||
def spacing_file(path):
    """
    Perform paranoid text spacing on the contents of the file at `path`
    and return the spaced text.

    The file is decoded as UTF-8 explicitly: relying on the locale's
    default encoding breaks on platforms whose default is not UTF-8,
    which matters for the CJK text this tool processes.
    """
    # TODO: read line by line
    with open(os.path.abspath(path), encoding='utf-8') as f:
        return spacing_text(f.read())
|
|
||||||
|
|
||||||
|
|
||||||
def cli(args=None):
    """
    Command-line entry point.

    If stdin is piped, its entire contents are spaced and printed and the
    arguments are ignored. Otherwise `args` (defaulting to
    ``sys.argv[1:]``) is parsed and the positional value is treated as
    raw text or, with ``-f``, as a file path.
    """
    if args is None:
        # Only fall back to the real command line when no argument list
        # was supplied; an explicitly passed empty list is parsed as-is
        # rather than being silently replaced (the original `if not args`
        # conflated the two).
        args = sys.argv[1:]

    parser = argparse.ArgumentParser(
        prog='pangu',
        description='pangu.py -- Paranoid text spacing for good readability, to automatically insert whitespace between CJK and half-width characters (alphabetical letters, numerical digits and symbols).',
    )
    parser.add_argument('-v', '--version', action='version', version=__version__)
    parser.add_argument('-t', '--text', action='store_true', dest='is_text', required=False, help='specify the input value is a text')
    parser.add_argument('-f', '--file', action='store_true', dest='is_file', required=False, help='specify the input value is a file path')
    parser.add_argument('text_or_path', action='store', type=str, help='the text or file path to apply spacing')

    if not sys.stdin.isatty():
        # Piped input takes precedence over positional arguments.
        print(spacing_text(sys.stdin.read()))  # noqa: T003
    else:
        parsed = parser.parse_args(args)
        if parsed.is_text:
            print(spacing_text(parsed.text_or_path))  # noqa: T003
        elif parsed.is_file:
            print(spacing_file(parsed.text_or_path))  # noqa: T003
        else:
            # Neither flag given: treat the positional value as raw text.
            print(spacing_text(parsed.text_or_path))  # noqa: T003
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: delegate to the CLI when executed directly.
if __name__ == '__main__':
    cli()
|
|
||||||
@@ -46,7 +46,7 @@ category: Text
|
|||||||
build-type: Simple
|
build-type: Simple
|
||||||
|
|
||||||
-- Extra doc files to be distributed with the package, such as a CHANGELOG or a README.
|
-- Extra doc files to be distributed with the package, such as a CHANGELOG or a README.
|
||||||
extra-doc-files: CHANGELOG.md
|
-- extra-doc-files: CHANGELOG.md
|
||||||
|
|
||||||
-- Extra source files to be distributed with the package, such as examples, or a tutorial module.
|
-- Extra source files to be distributed with the package, such as examples, or a tutorial module.
|
||||||
-- extra-source-files:
|
-- extra-source-files:
|
||||||
@@ -59,7 +59,7 @@ library
|
|||||||
import: warnings
|
import: warnings
|
||||||
|
|
||||||
-- Modules exported by the library.
|
-- Modules exported by the library.
|
||||||
exposed-modules: MyLib
|
exposed-modules: Pangu
|
||||||
|
|
||||||
-- Modules included in this library but not exported.
|
-- Modules included in this library but not exported.
|
||||||
-- other-modules:
|
-- other-modules:
|
||||||
|
|||||||
138
src/MyLib.hs
138
src/MyLib.hs
@@ -1,138 +0,0 @@
|
|||||||
{-# LANGUAGE OverloadedStrings #-}
|
|
||||||
|
|
||||||
module MyLib where
|
|
||||||
|
|
||||||
import Data.Function (fix)
|
|
||||||
import Data.Text (Text)
|
|
||||||
import qualified Data.Text as T
|
|
||||||
import Data.Void (Void)
|
|
||||||
import Replace.Megaparsec (streamEdit)
|
|
||||||
import Text.Megaparsec
|
|
||||||
import Text.Megaparsec.Char
|
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
type Parser = Parsec Void Text
|
|
||||||
|
|
||||||
type Rule = Parser Text
|
|
||||||
|
|
||||||
type RuleSet = [Rule]
|
|
||||||
|
|
||||||
applyUntilFixed :: Rule -> Text -> Text
|
|
||||||
applyUntilFixed rule =
|
|
||||||
fix
|
|
||||||
( \loop current ->
|
|
||||||
let next = streamEdit rule id current
|
|
||||||
in if next == current then next else loop next
|
|
||||||
)
|
|
||||||
|
|
||||||
applyRules :: RuleSet -> Text -> Text
|
|
||||||
applyRules rules input = foldl (flip applyUntilFixed) input rules
|
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
-- rules for pangu
|
|
||||||
|
|
||||||
-- | Check if a character falls within the CJK ranges provided
|
|
||||||
isCJK :: Char -> Bool
|
|
||||||
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
|
|
||||||
where
|
|
||||||
cjkRanges =
|
|
||||||
[ ('\x2e80', '\x2eff'),
|
|
||||||
('\x2f00', '\x2fdf'),
|
|
||||||
('\x3040', '\x309f'),
|
|
||||||
('\x30a0', '\x30fa'),
|
|
||||||
('\x30fc', '\x30ff'),
|
|
||||||
('\x3100', '\x312f'),
|
|
||||||
('\x3200', '\x32ff'),
|
|
||||||
('\x3400', '\x4dbf'),
|
|
||||||
('\x4e00', '\x9fff'),
|
|
||||||
('\xf900', '\xfaff')
|
|
||||||
]
|
|
||||||
|
|
||||||
convertToFullwidth :: Char -> Char
|
|
||||||
convertToFullwidth c =
|
|
||||||
case c of
|
|
||||||
':' -> ':'
|
|
||||||
'.' -> '。'
|
|
||||||
'~' -> '~'
|
|
||||||
'!' -> '!'
|
|
||||||
'?' -> '?'
|
|
||||||
',' -> ','
|
|
||||||
';' -> ';'
|
|
||||||
'\"' -> '”'
|
|
||||||
'\'' -> '’'
|
|
||||||
_ -> c
|
|
||||||
|
|
||||||
-- A parser that matches a single CJK character
|
|
||||||
cjkChar :: Parser Char
|
|
||||||
cjkChar = satisfy isCJK
|
|
||||||
|
|
||||||
-- use pangu.py as reference for these rules
|
|
||||||
|
|
||||||
fullwidthCJKsymCJK :: Rule
|
|
||||||
fullwidthCJKsymCJK = do
|
|
||||||
lcjk <- cjkChar
|
|
||||||
_ <- many (char ' ')
|
|
||||||
sym <- try (some (char ':')) <|> count 1 (char '.')
|
|
||||||
_ <- many (char ' ')
|
|
||||||
rcjk <- cjkChar
|
|
||||||
let transformedsym = map convertToFullwidth sym
|
|
||||||
return $ T.pack $ [lcjk] ++ transformedsym ++ [rcjk]
|
|
||||||
|
|
||||||
fullwidthCJKsym :: Rule
|
|
||||||
fullwidthCJKsym = do
|
|
||||||
cjk <- cjkChar
|
|
||||||
_ <- many (char ' ')
|
|
||||||
sym <- some $ oneOf ("~!?,;" :: [Char])
|
|
||||||
_ <- many (char ' ')
|
|
||||||
let transformedsym = T.pack $ map convertToFullwidth sym
|
|
||||||
return $ T.pack [cjk] <> transformedsym
|
|
||||||
|
|
||||||
dotsCJK :: Rule
|
|
||||||
dotsCJK = do
|
|
||||||
dots <- chunk "..." <|> chunk "…"
|
|
||||||
cjk <- cjkChar
|
|
||||||
return $ dots <> T.pack (" " ++ [cjk])
|
|
||||||
|
|
||||||
fixCJKcolAN :: Rule
|
|
||||||
fixCJKcolAN = do
|
|
||||||
cjk <- cjkChar
|
|
||||||
_ <- chunk ":"
|
|
||||||
an <- alphaNumChar
|
|
||||||
return $ T.pack $ [cjk] ++ ":" ++ [an]
|
|
||||||
|
|
||||||
-- quotes
|
|
||||||
-- seems confusing ...
|
|
||||||
quotesym :: [Char]
|
|
||||||
quotesym = "\x05f4\"\'`"
|
|
||||||
|
|
||||||
cjkquote :: Rule
|
|
||||||
cjkquote = do
|
|
||||||
cjk <- cjkChar
|
|
||||||
quote <- oneOf quotesym
|
|
||||||
return $ T.pack $ [cjk] ++ " " ++ [quote]
|
|
||||||
|
|
||||||
quoteCJK :: Rule
|
|
||||||
quoteCJK = do
|
|
||||||
quote <- oneOf quotesym
|
|
||||||
cjk <- cjkChar
|
|
||||||
return $ T.pack $ [quote] ++ " " ++ [cjk]
|
|
||||||
|
|
||||||
fixQuote :: Rule
|
|
||||||
fixQuote = do
|
|
||||||
openQuotes <- T.pack <$> some (oneOf quotesym)
|
|
||||||
_ <- many spaceChar
|
|
||||||
content <- T.pack <$> someTill anySingle (lookAhead $ some (oneOf quotesym))
|
|
||||||
closeQuotes <- T.pack <$> some (oneOf quotesym)
|
|
||||||
return $ openQuotes <> T.strip content <> closeQuotes
|
|
||||||
|
|
||||||
-- the rule set
|
|
||||||
myRules :: RuleSet
|
|
||||||
myRules =
|
|
||||||
[ fullwidthCJKsymCJK,
|
|
||||||
fullwidthCJKsym,
|
|
||||||
dotsCJK,
|
|
||||||
fixCJKcolAN,
|
|
||||||
cjkquote,
|
|
||||||
quoteCJK,
|
|
||||||
fixQuote
|
|
||||||
]
|
|
||||||
242
src/Pangu.hs
Normal file
242
src/Pangu.hs
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
|
module Pangu (pangu, isCJK) where
|
||||||
|
|
||||||
|
import Data.Function (fix)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import qualified Data.Text as T
|
||||||
|
import Data.Void (Void)
|
||||||
|
import Replace.Megaparsec (streamEdit)
|
||||||
|
import Text.Megaparsec
|
||||||
|
import Text.Megaparsec.Char
|
||||||
|
|
||||||
|
-------------------------------------------------------------------------------
|
||||||
|
type Parser = Parsec Void Text
|
||||||
|
|
||||||
|
type Rule = Parser Text
|
||||||
|
|
||||||
|
type RuleSet = [Rule]
|
||||||
|
|
||||||
|
-- | Repeatedly apply one rewrite rule over the whole text until it
-- reaches a fixed point (a pass that changes nothing). The loop is
-- expressed with 'fix'; each pass runs 'streamEdit', and wrapping the
-- rule in 'try' lets the parser backtrack cleanly on partial matches.
applyUntilFixed :: Rule -> Text -> Text
applyUntilFixed rule =
  fix
    ( \loop current ->
        let next = streamEdit (try rule) id current
         in if next == current then next else loop next
    )
|
||||||
|
|
||||||
|
applyRulesRecursively :: RuleSet -> Text -> Text
|
||||||
|
applyRulesRecursively rules input = foldl (flip applyUntilFixed) input rules
|
||||||
|
|
||||||
|
-- | Apply each rule exactly once, left to right, threading the text
-- through (contrast with 'applyRulesRecursively', which iterates each
-- rule to a fixed point).
applyRules :: RuleSet -> Text -> Text
applyRules rules input = foldl (flip applyOnce) input rules
  where
    -- One streamEdit pass per rule; 'try' enables backtracking.
    applyOnce rule = streamEdit (try rule) id
|
||||||
|
|
||||||
|
-------------------------------------------------------------------------------
|
||||||
|
-- rules for pangu
|
||||||
|
|
||||||
|
-- alphaNumChar from megaparsec matches CJK chars...
|
||||||
|
-- need to implement a new one
|
||||||
|
alphanumericChar :: Parser Char
|
||||||
|
alphanumericChar = satisfy $ \c ->
|
||||||
|
(c >= 'a' && c <= 'z')
|
||||||
|
|| (c >= 'A' && c <= 'Z')
|
||||||
|
|| (c >= '0' && c <= '9')
|
||||||
|
|
||||||
|
-- | Check if a character falls within the CJK ranges provided
|
||||||
|
isCJK :: Char -> Bool
|
||||||
|
isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
|
||||||
|
where
|
||||||
|
cjkRanges =
|
||||||
|
[ ('\x2e80', '\x2eff'),
|
||||||
|
('\x2f00', '\x2fdf'),
|
||||||
|
('\x3040', '\x309f'),
|
||||||
|
('\x30a0', '\x30fa'),
|
||||||
|
('\x30fc', '\x30ff'),
|
||||||
|
('\x3100', '\x312f'),
|
||||||
|
('\x3200', '\x32ff'),
|
||||||
|
('\x3400', '\x4dbf'),
|
||||||
|
('\x4e00', '\x9fff'),
|
||||||
|
('\xf900', '\xfaff')
|
||||||
|
]
|
||||||
|
|
||||||
|
-- | Map a half-width punctuation character to its full-width
-- counterpart; any other character is returned unchanged.
convertToFullwidth :: Char -> Char
convertToFullwidth c =
  case c of
    ':' -> ':'
    '.' -> '。'
    '~' -> '~'
    '!' -> '!'
    '?' -> '?'
    ',' -> ','
    ';' -> ';'
    '\"' -> '”'
    '\'' -> '’'
    _ -> c
|
||||||
|
|
||||||
|
-- A parser that matches a single CJK character
|
||||||
|
cjkChar :: Parser Char
|
||||||
|
cjkChar = satisfy isCJK
|
||||||
|
|
||||||
|
-- use pangu.py as reference for these rules
|
||||||
|
|
||||||
|
fullwidthCJKsymCJK :: Rule
|
||||||
|
fullwidthCJKsymCJK = do
|
||||||
|
lcjk <- cjkChar
|
||||||
|
_ <- many (char ' ')
|
||||||
|
sym <- try (some (char ':')) <|> count 1 (char '.')
|
||||||
|
_ <- many (char ' ')
|
||||||
|
rcjk <- cjkChar
|
||||||
|
let transformedsym = map convertToFullwidth sym
|
||||||
|
return $ T.pack $ [lcjk] ++ transformedsym ++ [rcjk]
|
||||||
|
|
||||||
|
fullwidthCJKsym :: Rule
|
||||||
|
fullwidthCJKsym = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
_ <- many (char ' ')
|
||||||
|
sym <- some $ oneOf ("~!?,;" :: [Char])
|
||||||
|
_ <- many (char ' ')
|
||||||
|
let transformedsym = T.pack $ map convertToFullwidth sym
|
||||||
|
return $ T.pack [cjk] <> transformedsym
|
||||||
|
|
||||||
|
dotsCJK :: Rule
|
||||||
|
dotsCJK = do
|
||||||
|
dots <- chunk "..." <|> chunk "…"
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ dots <> T.pack (" " ++ [cjk])
|
||||||
|
|
||||||
|
fixCJKcolAN :: Rule
|
||||||
|
fixCJKcolAN = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
_ <- char ':'
|
||||||
|
an <- alphanumericChar
|
||||||
|
return $ T.pack $ [cjk] ++ ":" ++ [an]
|
||||||
|
|
||||||
|
-- quotes
|
||||||
|
-- seems confusing ...
|
||||||
|
quotesym :: [Char]
|
||||||
|
quotesym = "'`\x05f4\""
|
||||||
|
|
||||||
|
cjkquote :: Rule
|
||||||
|
cjkquote = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
quote <- oneOf quotesym
|
||||||
|
return $ T.pack $ [cjk] ++ " " ++ [quote]
|
||||||
|
|
||||||
|
quoteCJK :: Rule
|
||||||
|
quoteCJK = do
|
||||||
|
quote <- oneOf quotesym
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ T.pack $ [quote] ++ " " ++ [cjk]
|
||||||
|
|
||||||
|
fixQuote :: Rule
|
||||||
|
fixQuote = do
|
||||||
|
openQuotes <- T.pack <$> some (oneOf quotesym)
|
||||||
|
_ <- many spaceChar
|
||||||
|
content <- T.pack <$> someTill anySingle (lookAhead $ some (oneOf quotesym))
|
||||||
|
closeQuotes <- T.pack <$> some (oneOf quotesym)
|
||||||
|
return $ openQuotes <> T.strip content <> closeQuotes
|
||||||
|
|
||||||
|
cjkpossessivequote :: Rule
|
||||||
|
cjkpossessivequote = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
_ <- char '\''
|
||||||
|
_ <- lookAhead $ anySingleBut 's'
|
||||||
|
return $ T.pack $ cjk : " '"
|
||||||
|
|
||||||
|
-- This singlequoteCJK rule will turn '你好' into ' 你好'
|
||||||
|
-- which seems not desirable...
|
||||||
|
-- however, the behavior is aligned with python version
|
||||||
|
singlequoteCJK :: Rule
|
||||||
|
singlequoteCJK = do
|
||||||
|
_ <- char '\''
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ T.pack $ "' " ++ [cjk]
|
||||||
|
|
||||||
|
fixPossessivequote :: Rule
|
||||||
|
fixPossessivequote = do
|
||||||
|
pre <- cjkChar <|> alphanumericChar
|
||||||
|
_ <- some spaceChar
|
||||||
|
_ <- chunk "'s"
|
||||||
|
return $ T.pack $ pre : "'s"
|
||||||
|
|
||||||
|
-- hash
|
||||||
|
hashANSCJKhash :: Rule
|
||||||
|
hashANSCJKhash = do
|
||||||
|
cjk1 <- cjkChar
|
||||||
|
_ <- char '#'
|
||||||
|
mid <- some cjkChar
|
||||||
|
_ <- char '#'
|
||||||
|
cjk2 <- cjkChar
|
||||||
|
return $ T.pack $ [cjk1] ++ " #" ++ mid ++ "# " ++ [cjk2]
|
||||||
|
|
||||||
|
cjkhash :: Rule
|
||||||
|
cjkhash = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
_ <- char '#'
|
||||||
|
_ <- lookAhead $ anySingleBut ' '
|
||||||
|
return $ T.pack $ cjk : " #"
|
||||||
|
|
||||||
|
hashcjk :: Rule
|
||||||
|
hashcjk = do
|
||||||
|
_ <- char '#'
|
||||||
|
_ <- lookAhead $ anySingleBut ' '
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ T.pack $ "# " ++ [cjk]
|
||||||
|
|
||||||
|
-- operators
|
||||||
|
cjkOPTan :: Rule
|
||||||
|
cjkOPTan = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
opt <- oneOf ("+-=*/&|<>%" :: [Char])
|
||||||
|
an <- alphanumericChar
|
||||||
|
return $ T.pack [cjk, ' ', opt, ' ', an]
|
||||||
|
|
||||||
|
anOPTcjk :: Rule
|
||||||
|
anOPTcjk = do
|
||||||
|
an <- alphanumericChar
|
||||||
|
opt <- oneOf ("+-=*/&|<>%" :: [Char])
|
||||||
|
cjk <- cjkChar
|
||||||
|
return $ T.pack [an, ' ', opt, ' ', cjk]
|
||||||
|
|
||||||
|
-- slash/bracket rules are not implemented
|
||||||
|
|
||||||
|
-- CJK and alphanumeric without space
|
||||||
|
|
||||||
|
cjkans :: Rule
|
||||||
|
cjkans = do
|
||||||
|
cjk <- cjkChar
|
||||||
|
_ <- lookAhead (alphanumericChar <|> oneOf ("@$%^&*-+\\=|/" :: [Char]))
|
||||||
|
return $ T.pack [cjk, ' ']
|
||||||
|
|
||||||
|
anscjk :: Rule
|
||||||
|
anscjk = do
|
||||||
|
an <- alphanumericChar <|> oneOf ("~!$%^&*-+\\=|;:,./?" :: [Char])
|
||||||
|
_ <- lookAhead cjkChar
|
||||||
|
return $ T.pack [an, ' ']
|
||||||
|
|
||||||
|
-- rule set, the order matters
|
||||||
|
recursiveRules :: RuleSet
|
||||||
|
recursiveRules = [fullwidthCJKsymCJK, fullwidthCJKsym]
|
||||||
|
|
||||||
|
onepassRules :: RuleSet
|
||||||
|
onepassRules =
|
||||||
|
[ dotsCJK,
|
||||||
|
fixCJKcolAN,
|
||||||
|
cjkquote,
|
||||||
|
quoteCJK,
|
||||||
|
fixQuote,
|
||||||
|
cjkpossessivequote,
|
||||||
|
-- singlequoteCJK,
|
||||||
|
fixPossessivequote,
|
||||||
|
hashANSCJKhash,
|
||||||
|
cjkhash,
|
||||||
|
-- hashcjk,
|
||||||
|
anscjk,
|
||||||
|
cjkans,
|
||||||
|
empty -- a dummy rule
|
||||||
|
]
|
||||||
|
|
||||||
|
pangu :: Text -> Text
|
||||||
|
pangu input = applyRules onepassRules $ applyRulesRecursively recursiveRules input
|
||||||
33
test/Main.hs
33
test/Main.hs
@@ -1,21 +1,28 @@
|
|||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
module Main (main) where
|
module Main (main) where
|
||||||
import MyLib
|
import Pangu
|
||||||
import Test.Hspec
|
import Test.Hspec
|
||||||
|
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = hspec $ do
|
main = hspec $ do
|
||||||
describe "MyLib.cjksym(cjk)" $ do
|
describe "Pangu.cjksym(cjk)" $ do
|
||||||
it "converts symbols to fullwidth" $ do
|
it "converts symbols to fullwidth" $ do
|
||||||
applyRules myRules "你 : 好" `shouldBe` "你:好"
|
pangu "你 : 好" `shouldBe` "你:好"
|
||||||
applyRules myRules "你.好" `shouldBe` "你。好"
|
pangu "你.好" `shouldBe` "你。好"
|
||||||
applyRules myRules "你:好:他" `shouldBe` "你:好:他"
|
pangu "你:好:他" `shouldBe` "你:好:他"
|
||||||
applyRules myRules "你 ? 好" `shouldBe` "你?好"
|
pangu "你 ? 好" `shouldBe` "你?好"
|
||||||
applyRules myRules "你…好" `shouldBe` "你… 好"
|
pangu "你…好" `shouldBe` "你… 好"
|
||||||
applyRules myRules "你...好" `shouldBe` "你... 好"
|
pangu "你...好" `shouldBe` "你... 好"
|
||||||
applyRules myRules "你:0" `shouldBe` "你:0"
|
pangu "你:0" `shouldBe` "你:0"
|
||||||
applyRules myRules "我说:\" 他说:\'你好\'\"" `shouldBe` "我说:\"他说:\'你好\'\""
|
it "fixes quotes" $ do
|
||||||
it "adds spaces" $ do
|
pangu "我说:\" 他说:'你好'\"" `shouldBe` "我说:\"他说:' 你好 '\""
|
||||||
applyRules myRules "\'你好\'" `shouldBe` "\'你好\'"
|
-- pangu "'你好'" `shouldBe` "' 你好'" -- strange behavior
|
||||||
applyRules myRules "你\'hello\'" `shouldBe` "你 \'hello\'"
|
pangu "你'hello'" `shouldBe` "你 'hello'"
|
||||||
|
pangu "我 's " `shouldBe` "我's "
|
||||||
|
it "fixes hash" $ do
|
||||||
|
pangu "你好#测试#世界" `shouldBe` "你好 #测试# 世界"
|
||||||
|
it "add spaces" $ do
|
||||||
|
pangu "你好and世界" `shouldBe` "你好 and 世界"
|
||||||
|
pangu "當你凝視著bug,bug也凝視著你" `shouldBe` "當你凝視著 bug,bug 也凝視著你"
|
||||||
|
pangu "與PM戰鬥的人,應當小心自己不要成為PM" `shouldBe` "與 PM 戰鬥的人,應當小心自己不要成為 PM"
|
||||||
Reference in New Issue
Block a user