diff --git a/pangu.py b/pangu.py deleted file mode 100644 index f1a3cb7..0000000 --- a/pangu.py +++ /dev/null @@ -1,191 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -""" -Paranoid text spacing for good readability, to automatically insert whitespace between CJK (Chinese, Japanese, Korean) and half-width characters (alphabetical letters, numerical digits and symbols). - ->>> import pangu ->>> nwe_text = pangu.spacing_text('當你凝視著bug,bug也凝視著你') ->>> print(nwe_text) -'當你凝視著 bug,bug 也凝視著你' ->>> nwe_content = pangu.spacing_file('path/to/file.txt') ->>> print(nwe_content) -'與 PM 戰鬥的人,應當小心自己不要成為 PM' -""" - -import argparse -import os -import re -import sys - -__version__ = '4.0.6.1' -__all__ = ['spacing_text', 'spacing_file', 'spacing', 'cli'] - -CJK = r'\u2e80-\u2eff\u2f00-\u2fdf\u3040-\u309f\u30a0-\u30fa\u30fc-\u30ff\u3100-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff' - -ANY_CJK = re.compile(r'[{CJK}]'.format(CJK=CJK)) - -CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK = re.compile('([{CJK}])([ ]*(?:[\\:]+|\\.)[ ]*)([{CJK}])'.format(CJK=CJK)) # there is an extra non-capturing group compared to JavaScript version -CONVERT_TO_FULLWIDTH_CJK_SYMBOLS = re.compile('([{CJK}])[ ]*([~\\!;,\\?]+)[ ]*'.format(CJK=CJK)) -DOTS_CJK = re.compile('([\\.]{{2,}}|\u2026)([{CJK}])'.format(CJK=CJK)) # need to escape { } -FIX_CJK_COLON_ANS = re.compile('([{CJK}])\\:([A-Z0-9\\(\\)])'.format(CJK=CJK)) - -CJK_QUOTE = re.compile('([{CJK}])([`"\u05f4])'.format(CJK=CJK)) # no need to escape ` -QUOTE_CJK = re.compile('([`"\u05f4])([{CJK}])'.format(CJK=CJK)) # no need to escape ` -FIX_QUOTE_ANY_QUOTE = re.compile(r'([`"\u05f4]+)(\s*)(.+?)(\s*)([`"\u05f4]+)') - -CJK_SINGLE_QUOTE_BUT_POSSESSIVE = re.compile("([{CJK}])('[^s])".format(CJK=CJK)) -SINGLE_QUOTE_CJK = re.compile("(')([{CJK}])".format(CJK=CJK)) -FIX_POSSESSIVE_SINGLE_QUOTE = re.compile("([{CJK}A-Za-z0-9])( )('s)".format(CJK=CJK)) - -HASH_ANS_CJK_HASH = re.compile('([{CJK}])(#)([{CJK}]+)(#)([{CJK}])'.format(CJK=CJK)) -CJK_HASH = re.compile('([{CJK}])(#([^ ]))'.format(CJK=CJK)) -HASH_CJK = re.compile('(([^ ])#)([{CJK}])'.format(CJK=CJK)) - -CJK_OPERATOR_ANS = re.compile('([{CJK}])([\\+\\-\\*\\/=&\\|<>])([A-Za-z0-9])'.format(CJK=CJK)) -ANS_OPERATOR_CJK = re.compile('([A-Za-z0-9])([\\+\\-\\*\\/=&\\|<>])([{CJK}])'.format(CJK=CJK)) - -FIX_SLASH_AS = re.compile(r'([/]) ([a-z\-_\./]+)') -FIX_SLASH_AS_SLASH = re.compile(r'([/\.])([A-Za-z\-_\./]+) ([/])') - -CJK_LEFT_BRACKET = re.compile('([{CJK}])([\\(\\[\\{{<>\u201c])'.format(CJK=CJK)) # need to escape { -RIGHT_BRACKET_CJK = re.compile('([\\)\\]\\}}<>\u201d])([{CJK}])'.format(CJK=CJK)) # need to escape } -FIX_LEFT_BRACKET_ANY_RIGHT_BRACKET = re.compile(r'([\(\[\{<\u201c]+)(\s*)(.+?)(\s*)([\)\]\}>\u201d]+)') # need to escape { } -ANS_CJK_LEFT_BRACKET_ANY_RIGHT_BRACKET = re.compile('([A-Za-z0-9{CJK}])[ ]*([\u201c])([A-Za-z0-9{CJK}\\-_ ]+)([\u201d])'.format(CJK=CJK)) -LEFT_BRACKET_ANY_RIGHT_BRACKET_ANS_CJK = re.compile('([\u201c])([A-Za-z0-9{CJK}\\-_ ]+)([\u201d])[ ]*([A-Za-z0-9{CJK}])'.format(CJK=CJK)) - -AN_LEFT_BRACKET = re.compile(r'([A-Za-z0-9])([\(\[\{])') -RIGHT_BRACKET_AN = re.compile(r'([\)\]\}])([A-Za-z0-9])') - -CJK_ANS = re.compile('([{CJK}])([A-Za-z\u0370-\u03ff0-9@\\$%\\^&\\*\\-\\+\\\\=\\|/\u00a1-\u00ff\u2150-\u218f\u2700—\u27bf])'.format(CJK=CJK)) -ANS_CJK = re.compile('([A-Za-z\u0370-\u03ff0-9~\\!\\$%\\^&\\*\\-\\+\\\\=\\|;:,\\./\\?\u00a1-\u00ff\u2150-\u218f\u2700—\u27bf])([{CJK}])'.format(CJK=CJK)) - -S_A = re.compile(r'(%)([A-Za-z])') - -MIDDLE_DOT = re.compile(r'([ ]*)([\u00b7\u2022\u2027])([ ]*)') - -# Python version only -TILDES = re.compile(r'~+') -EXCLAMATION_MARKS = re.compile(r'!+') -SEMICOLONS = re.compile(r';+') -COLONS = re.compile(r':+') -COMMAS = re.compile(r',+') -PERIODS = re.compile(r'\.+') -QUESTION_MARKS = re.compile(r'\?+') - - -def convert_to_fullwidth(symbols): - symbols = TILDES.sub('~', symbols) - symbols = EXCLAMATION_MARKS.sub('!', symbols) - symbols = SEMICOLONS.sub(';', symbols) - symbols = COLONS.sub(':', symbols) - symbols = COMMAS.sub(',', symbols) - symbols = PERIODS.sub('。', symbols) - symbols = QUESTION_MARKS.sub('?', symbols) - return symbols.strip() - - -def spacing(text): - """ - Perform paranoid text spacing on text. - """ - if len(text) <= 1 or not ANY_CJK.search(text): - return text - - new_text = text - - # TODO: refactoring - matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK.search(new_text) - while matched: - start, end = matched.span() - new_text = ''.join((new_text[:start + 1], convert_to_fullwidth(new_text[start + 1:end - 1]), new_text[end - 1:])) - matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK.search(new_text) - - matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS.search(new_text) - while matched: - start, end = matched.span() - new_text = ''.join((new_text[:start + 1].strip(), convert_to_fullwidth(new_text[start + 1:end]), new_text[end:].strip())) - matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS.search(new_text) - - new_text = DOTS_CJK.sub(r'\1 \2', new_text) - new_text = FIX_CJK_COLON_ANS.sub(r'\1:\2', new_text) - - new_text = CJK_QUOTE.sub(r'\1 \2', new_text) - new_text = QUOTE_CJK.sub(r'\1 \2', new_text) - new_text = FIX_QUOTE_ANY_QUOTE.sub(r'\1\3\5', new_text) - - new_text = CJK_SINGLE_QUOTE_BUT_POSSESSIVE.sub(r'\1 \2', new_text) - new_text = SINGLE_QUOTE_CJK.sub(r'\1 \2', new_text) - new_text = FIX_POSSESSIVE_SINGLE_QUOTE.sub(r"\1's", new_text) - - new_text = HASH_ANS_CJK_HASH.sub(r'\1 \2\3\4 \5', new_text) - new_text = CJK_HASH.sub(r'\1 \2', new_text) - new_text = HASH_CJK.sub(r'\1 \3', new_text) - - new_text = CJK_OPERATOR_ANS.sub(r'\1 \2 \3', new_text) - new_text = ANS_OPERATOR_CJK.sub(r'\1 \2 \3', new_text) - - new_text = FIX_SLASH_AS.sub(r'\1\2', new_text) - new_text = FIX_SLASH_AS_SLASH.sub(r'\1\2\3', new_text) - - new_text = CJK_LEFT_BRACKET.sub(r'\1 \2', new_text) - new_text = RIGHT_BRACKET_CJK.sub(r'\1 \2', new_text) - new_text = FIX_LEFT_BRACKET_ANY_RIGHT_BRACKET.sub(r'\1\3\5', new_text) - new_text = ANS_CJK_LEFT_BRACKET_ANY_RIGHT_BRACKET.sub(r'\1 \2\3\4', new_text) - new_text = LEFT_BRACKET_ANY_RIGHT_BRACKET_ANS_CJK.sub(r'\1\2\3 \4', new_text) - - new_text = AN_LEFT_BRACKET.sub(r'\1 \2', new_text) - new_text = RIGHT_BRACKET_AN.sub(r'\1 \2', new_text) - - new_text = CJK_ANS.sub(r'\1 \2', new_text) - new_text = ANS_CJK.sub(r'\1 \2', new_text) - - new_text = S_A.sub(r'\1 \2', new_text) - - new_text = MIDDLE_DOT.sub('・', new_text) - - return new_text.strip() - - -def spacing_text(text): - """ - Perform paranoid text spacing on text. An alias of `spacing()`. - """ - return spacing(text) - - -def spacing_file(path): - """ - Perform paranoid text spacing from file. - """ - # TODO: read line by line - with open(os.path.abspath(path)) as f: - return spacing_text(f.read()) - - -def cli(args=None): - if not args: - args = sys.argv[1:] - - parser = argparse.ArgumentParser( - prog='pangu', - description='pangu.py -- Paranoid text spacing for good readability, to automatically insert whitespace between CJK and half-width characters (alphabetical letters, numerical digits and symbols).', - ) - parser.add_argument('-v', '--version', action='version', version=__version__) - parser.add_argument('-t', '--text', action='store_true', dest='is_text', required=False, help='specify the input value is a text') - parser.add_argument('-f', '--file', action='store_true', dest='is_file', required=False, help='specify the input value is a file path') - parser.add_argument('text_or_path', action='store', type=str, help='the text or file path to apply spacing') - - if not sys.stdin.isatty(): - print(spacing_text(sys.stdin.read())) # noqa: T003 - else: - args = parser.parse_args(args) - if args.is_text: - print(spacing_text(args.text_or_path)) # noqa: T003 - elif args.is_file: - print(spacing_file(args.text_or_path)) # noqa: T003 - else: - print(spacing_text(args.text_or_path)) # noqa: T003 - - -if __name__ == '__main__': - cli() \ No newline at end of file diff --git a/panguFilter.cabal b/panguFilter.cabal index 181782e..41be4c8 100644 --- a/panguFilter.cabal +++ b/panguFilter.cabal @@ -59,7 +59,7 @@ library import: warnings -- Modules exported by the library. - exposed-modules: MyLib + exposed-modules: Pangu -- Modules included in this library but not exported. -- other-modules: diff --git a/src/MyLib.hs b/src/Pangu.hs similarity index 98% rename from src/MyLib.hs rename to src/Pangu.hs index 57656b5..49c737e 100644 --- a/src/MyLib.hs +++ b/src/Pangu.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} -module MyLib where +module Pangu (pangu, isCJK) where import Data.Function (fix) import Data.Text (Text) @@ -218,10 +218,7 @@ anscjk = do -- rule set, the order matters recursiveRules :: RuleSet -recursiveRules = - [ fullwidthCJKsymCJK, - fullwidthCJKsym - ] +recursiveRules = [fullwidthCJKsymCJK, fullwidthCJKsym] onepassRules :: RuleSet onepassRules = diff --git a/test/Main.hs b/test/Main.hs index 2b363cb..57e250d 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -1,12 +1,12 @@ {-# LANGUAGE OverloadedStrings #-} module Main (main) where -import MyLib +import Pangu import Test.Hspec main :: IO () main = hspec $ do - describe "MyLib.cjksym(cjk)" $ do + describe "Pangu.cjksym(cjk)" $ do it "converts symbols to fullwidth" $ do pangu "你 : 好" `shouldBe` "你:好" pangu "你.好" `shouldBe` "你。好"