commit 272b1a9a77b1f2484aa108d7f49dd3bf73a8da89 Author: Yu Cong Date: Fri Jan 2 15:26:19 2026 +0800 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9e84163 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +dist-newstyle/ +.ds_store diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1006c9c --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "haskell.toolchain": { + "hls": "recommended", + "cabal": "recommended", + "stack": null + } +} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..b37569c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +# Revision history for panguFilter + +## 0.1.0.0 -- YYYY-mm-dd + +* First version. Released on an unsuspecting world. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f8e8956 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +Copyright (c) 2026, Yu Cong + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pangu.py b/pangu.py new file mode 100644 index 0000000..f1a3cb7 --- /dev/null +++ b/pangu.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +# coding: utf-8 +""" +Paranoid text spacing for good readability, to automatically insert whitespace between CJK (Chinese, Japanese, Korean) and half-width characters (alphabetical letters, numerical digits and symbols). + +>>> import pangu +>>> nwe_text = pangu.spacing_text('當你凝視著bug,bug也凝視著你') +>>> print(nwe_text) +'當你凝視著 bug,bug 也凝視著你' +>>> nwe_content = pangu.spacing_file('path/to/file.txt') +>>> print(nwe_content) +'與 PM 戰鬥的人,應當小心自己不要成為 PM' +""" + +import argparse +import os +import re +import sys + +__version__ = '4.0.6.1' +__all__ = ['spacing_text', 'spacing_file', 'spacing', 'cli'] + +CJK = r'\u2e80-\u2eff\u2f00-\u2fdf\u3040-\u309f\u30a0-\u30fa\u30fc-\u30ff\u3100-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff' + +ANY_CJK = re.compile(r'[{CJK}]'.format(CJK=CJK)) + +CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK = re.compile('([{CJK}])([ ]*(?:[\\:]+|\\.)[ ]*)([{CJK}])'.format(CJK=CJK)) # there is an extra non-capturing group compared to JavaScript version +CONVERT_TO_FULLWIDTH_CJK_SYMBOLS = re.compile('([{CJK}])[ ]*([~\\!;,\\?]+)[ ]*'.format(CJK=CJK)) +DOTS_CJK = re.compile('([\\.]{{2,}}|\u2026)([{CJK}])'.format(CJK=CJK)) # need to escape { } +FIX_CJK_COLON_ANS = re.compile('([{CJK}])\\:([A-Z0-9\\(\\)])'.format(CJK=CJK)) + +CJK_QUOTE = re.compile('([{CJK}])([`"\u05f4])'.format(CJK=CJK)) # no need to 
escape `
+QUOTE_CJK = re.compile('([`"\u05f4])([{CJK}])'.format(CJK=CJK))  # no need to escape `
+FIX_QUOTE_ANY_QUOTE = re.compile(r'([`"\u05f4]+)(\s*)(.+?)(\s*)([`"\u05f4]+)')  # groups 2 and 4 are the padding just inside the quotes
+
+CJK_SINGLE_QUOTE_BUT_POSSESSIVE = re.compile("([{CJK}])('[^s])".format(CJK=CJK))
+SINGLE_QUOTE_CJK = re.compile("(')([{CJK}])".format(CJK=CJK))
+FIX_POSSESSIVE_SINGLE_QUOTE = re.compile("([{CJK}A-Za-z0-9])( )('s)".format(CJK=CJK))
+
+HASH_ANS_CJK_HASH = re.compile('([{CJK}])(#)([{CJK}]+)(#)([{CJK}])'.format(CJK=CJK))
+CJK_HASH = re.compile('([{CJK}])(#([^ ]))'.format(CJK=CJK))
+HASH_CJK = re.compile('(([^ ])#)([{CJK}])'.format(CJK=CJK))
+
+CJK_OPERATOR_ANS = re.compile('([{CJK}])([\\+\\-\\*\\/=&\\|<>])([A-Za-z0-9])'.format(CJK=CJK))
+ANS_OPERATOR_CJK = re.compile('([A-Za-z0-9])([\\+\\-\\*\\/=&\\|<>])([{CJK}])'.format(CJK=CJK))
+
+FIX_SLASH_AS = re.compile(r'([/]) ([a-z\-_\./]+)')
+FIX_SLASH_AS_SLASH = re.compile(r'([/\.])([A-Za-z\-_\./]+) ([/])')
+
+CJK_LEFT_BRACKET = re.compile('([{CJK}])([\\(\\[\\{{<>\u201c])'.format(CJK=CJK))  # need to escape {
+RIGHT_BRACKET_CJK = re.compile('([\\)\\]\\}}<>\u201d])([{CJK}])'.format(CJK=CJK))  # need to escape }
+FIX_LEFT_BRACKET_ANY_RIGHT_BRACKET = re.compile(r'([\(\[\{<\u201c]+)(\s*)(.+?)(\s*)([\)\]\}>\u201d]+)')  # need to escape { }
+ANS_CJK_LEFT_BRACKET_ANY_RIGHT_BRACKET = re.compile('([A-Za-z0-9{CJK}])[ ]*([\u201c])([A-Za-z0-9{CJK}\\-_ ]+)([\u201d])'.format(CJK=CJK))
+LEFT_BRACKET_ANY_RIGHT_BRACKET_ANS_CJK = re.compile('([\u201c])([A-Za-z0-9{CJK}\\-_ ]+)([\u201d])[ ]*([A-Za-z0-9{CJK}])'.format(CJK=CJK))
+
+AN_LEFT_BRACKET = re.compile(r'([A-Za-z0-9])([\(\[\{])')
+RIGHT_BRACKET_AN = re.compile(r'([\)\]\}])([A-Za-z0-9])')
+
+CJK_ANS = re.compile('([{CJK}])([A-Za-z\u0370-\u03ff0-9@\\$%\\^&\\*\\-\\+\\\\=\\|/\u00a1-\u00ff\u2150-\u218f\u2700-\u27bf])'.format(CJK=CJK))  # CJK then a half-width char; \u2700-\u27bf must be a hyphen range (Dingbats), an em dash there would match U+2014 itself
+ANS_CJK = re.compile('([A-Za-z\u0370-\u03ff0-9~\\!\\$%\\^&\\*\\-\\+\\\\=\\|;:,\\./\\?\u00a1-\u00ff\u2150-\u218f\u2700-\u27bf])([{CJK}])'.format(CJK=CJK))  # half-width char then CJK; same Dingbats range as CJK_ANS
+
+S_A = 
re.compile(r'(%)([A-Za-z])')
+
+MIDDLE_DOT = re.compile(r'([ ]*)([\u00b7\u2022\u2027])([ ]*)')
+
+# Python version only
+TILDES = re.compile(r'~+')
+EXCLAMATION_MARKS = re.compile(r'!+')
+SEMICOLONS = re.compile(r';+')
+COLONS = re.compile(r':+')
+COMMAS = re.compile(r',+')
+PERIODS = re.compile(r'\.+')
+QUESTION_MARKS = re.compile(r'\?+')
+
+
+def convert_to_fullwidth(symbols):  # collapse every run of one half-width mark into a single full-width mark, then trim surrounding whitespace
+    symbols = TILDES.sub('\uff5e', symbols)  # FULLWIDTH TILDE; written as escapes so the targets cannot silently fold back to ASCII
+    symbols = EXCLAMATION_MARKS.sub('\uff01', symbols)  # FULLWIDTH EXCLAMATION MARK
+    symbols = SEMICOLONS.sub('\uff1b', symbols)  # FULLWIDTH SEMICOLON
+    symbols = COLONS.sub('\uff1a', symbols)  # FULLWIDTH COLON
+    symbols = COMMAS.sub('\uff0c', symbols)  # FULLWIDTH COMMA
+    symbols = PERIODS.sub('。', symbols)
+    symbols = QUESTION_MARKS.sub('\uff1f', symbols)  # FULLWIDTH QUESTION MARK
+    return symbols.strip()
+
+
+def spacing(text):
+    """
+    Perform paranoid text spacing on text.
+    """
+    if len(text) <= 1 or not ANY_CJK.search(text):
+        return text
+
+    new_text = text
+
+    # TODO: refactoring
+    matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK.search(new_text)
+    while matched:  # re-search instead of sub(): neighbouring matches share a CJK character; ends because the converted marks no longer match
+        start, end = matched.span()
+        new_text = ''.join((new_text[:start + 1], convert_to_fullwidth(new_text[start + 1:end - 1]), new_text[end - 1:]))
+        matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK.search(new_text)
+
+    matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS.search(new_text)
+    while matched:
+        start, end = matched.span()
+        new_text = ''.join((new_text[:start + 1], convert_to_fullwidth(new_text[start + 1:end]), new_text[end:]))  # no .strip() on the remainder: it would swallow a newline or tab that follows the symbols
+        matched = CONVERT_TO_FULLWIDTH_CJK_SYMBOLS.search(new_text)
+
+    new_text = DOTS_CJK.sub(r'\1 \2', new_text)
+    new_text = FIX_CJK_COLON_ANS.sub('\\1\uff1a\\2', new_text)  # CJK + ':' + [A-Z0-9()] gets a full-width colon (a plain ':' here would be a no-op)
+
+    new_text = CJK_QUOTE.sub(r'\1 \2', new_text)
+    new_text = QUOTE_CJK.sub(r'\1 \2', new_text)
+    new_text = FIX_QUOTE_ANY_QUOTE.sub(r'\1\3\5', new_text)
+
+    new_text = CJK_SINGLE_QUOTE_BUT_POSSESSIVE.sub(r'\1 \2', new_text)
+    new_text = SINGLE_QUOTE_CJK.sub(r'\1 \2', new_text)
+    new_text = FIX_POSSESSIVE_SINGLE_QUOTE.sub(r"\1's", new_text)
+
+    new_text = HASH_ANS_CJK_HASH.sub(r'\1 \2\3\4 \5', new_text)
+    new_text = CJK_HASH.sub(r'\1 \2', new_text)
+    new_text = HASH_CJK.sub(r'\1 \3', new_text)
+
+    new_text = CJK_OPERATOR_ANS.sub(r'\1 \2 \3', new_text)
+    new_text = ANS_OPERATOR_CJK.sub(r'\1 \2 \3', new_text)
+
+    new_text = FIX_SLASH_AS.sub(r'\1\2', new_text)
+    new_text = FIX_SLASH_AS_SLASH.sub(r'\1\2\3', new_text)
+
+    new_text = CJK_LEFT_BRACKET.sub(r'\1 \2', new_text)
+    new_text = RIGHT_BRACKET_CJK.sub(r'\1 \2', new_text)
+    new_text = FIX_LEFT_BRACKET_ANY_RIGHT_BRACKET.sub(r'\1\3\5', new_text)
+    new_text = ANS_CJK_LEFT_BRACKET_ANY_RIGHT_BRACKET.sub(r'\1 \2\3\4', new_text)
+    new_text = LEFT_BRACKET_ANY_RIGHT_BRACKET_ANS_CJK.sub(r'\1\2\3 \4', new_text)
+
+    new_text = AN_LEFT_BRACKET.sub(r'\1 \2', new_text)
+    new_text = RIGHT_BRACKET_AN.sub(r'\1 \2', new_text)
+
+    new_text = CJK_ANS.sub(r'\1 \2', new_text)
+    new_text = ANS_CJK.sub(r'\1 \2', new_text)
+
+    new_text = S_A.sub(r'\1 \2', new_text)
+
+    new_text = MIDDLE_DOT.sub('・', new_text)
+
+    return new_text.strip()
+
+
+def spacing_text(text):
+    """
+    Perform paranoid text spacing on text. An alias of `spacing()`.
+    """
+    return spacing(text)
+
+
+def spacing_file(path, encoding='utf-8'):  # encoding is explicit so the result does not depend on the platform locale; pass another codec for legacy files
+    """
+    Perform paranoid text spacing from file.
+    """
+    # TODO: read line by line
+    with open(os.path.abspath(path), encoding=encoding) as f:
+        return spacing_text(f.read())
+
+
+def cli(args=None):
+    if not args:  # None or an empty list: fall back to the process arguments
+        args = sys.argv[1:]
+
+    parser = argparse.ArgumentParser(
+        prog='pangu',
+        description='pangu.py -- Paranoid text spacing for good readability, to automatically insert whitespace between CJK and half-width characters (alphabetical letters, numerical digits and symbols).',
+    )
+    parser.add_argument('-v', '--version', action='version', version=__version__)
+    parser.add_argument('-t', '--text', action='store_true', dest='is_text', required=False, help='specify the input value is a text')
+    parser.add_argument('-f', '--file', action='store_true', dest='is_file', required=False, help='specify the input value is a file path')
+    parser.add_argument('text_or_path', action='store', type=str, help='the text or file path to apply spacing')
+
+    if not sys.stdin.isatty():  # piped input wins: the command-line arguments are not parsed in this branch
+        print(spacing_text(sys.stdin.read()))  # noqa: T003
+    else:
+        args = parser.parse_args(args)
+        if args.is_text:
+            print(spacing_text(args.text_or_path))  # noqa: T003
+        elif args.is_file:
+            print(spacing_file(args.text_or_path))  # noqa: T003
+        else:
+            print(spacing_text(args.text_or_path))  # noqa: T003
+
+
+if __name__ == '__main__':
+    cli()
\ No newline at end of file
diff --git a/pangu.simple.js b/pangu.simple.js
new file mode 100644
index 0000000..0d5b86b
--- /dev/null
+++ b/pangu.simple.js
@@ -0,0 +1,689 @@
+/*!
+ * pangu.simple.js
+ * --------
+ * @version: 1.0.5
+ * @homepage: https://github.com/backrunner/pangu.simple.js
+ * @license: MIT
+ * @author: BackRunner
+ */
+(function webpackUniversalModuleDefinition(root, factory) {
+  if(typeof exports === 'object' && typeof module === 'object')
+    module.exports = factory();
+  else if(typeof define === 'function' && define.amd)
+    define("pangu", [], factory);
+  else if(typeof exports === 'object')
+    exports["pangu"] = factory();
+  else
+    root["pangu"] = factory();
+})(typeof self !== 'undefined' ? self : this, function() { // global object without assuming a browser-only "window"
+return /******/ (function(modules) { // webpackBootstrap -- must start on the "return" line: a line break after "return" triggers automatic semicolon insertion and the factory returns undefined
+/******/
+/******/   // The module cache
+/******/   var installedModules = {};
+/******/
+/******/   // The require function
+/******/   function __webpack_require__(moduleId) {
+/******/
+/******/     // Check if module is in cache
+/******/     if(installedModules[moduleId]) {
+/******/       return installedModules[moduleId].exports;
+/******/     }
+/******/     // Create a new module (and put it into the cache)
+/******/     var module = installedModules[moduleId] = {
+/******/       i: moduleId,
+/******/       l: false,
+/******/       exports: {}
+/******/     };
+/******/
+/******/     // Execute the module function
+/******/     modules[moduleId].call(module.exports, module, module.exports, __webpack_require__);
+/******/
+/******/     // Flag the module as loaded
+/******/     module.l = true;
+/******/
+/******/     // Return the exports of the module
+/******/     return module.exports;
+/******/   }
+/******/
+/******/
+/******/   // expose the modules object (__webpack_modules__)
+/******/   __webpack_require__.m = modules;
+/******/
+/******/   // expose the module cache
+/******/   __webpack_require__.c = installedModules;
+/******/
+/******/   // define getter function for harmony exports
+/******/   __webpack_require__.d = function(exports, name, getter) {
+/******/     if(!__webpack_require__.o(exports, name)) {
+/******/       Object.defineProperty(exports, name, { enumerable: true, get: getter });
+/******/     }
+/******/   };
+/******/
+/******/   // define __esModule on 
exports +/******/ __webpack_require__.r = function(exports) { +/******/ if(typeof Symbol !== 'undefined' && Symbol.toStringTag) { +/******/ Object.defineProperty(exports, Symbol.toStringTag, { value: 'Module' }); +/******/ } +/******/ Object.defineProperty(exports, '__esModule', { value: true }); +/******/ }; +/******/ +/******/ // create a fake namespace object +/******/ // mode & 1: value is a module id, require it +/******/ // mode & 2: merge all properties of value into the ns +/******/ // mode & 4: return value when already ns object +/******/ // mode & 8|1: behave like require +/******/ __webpack_require__.t = function(value, mode) { +/******/ if(mode & 1) value = __webpack_require__(value); +/******/ if(mode & 8) return value; +/******/ if((mode & 4) && typeof value === 'object' && value && value.__esModule) return value; +/******/ var ns = Object.create(null); +/******/ __webpack_require__.r(ns); +/******/ Object.defineProperty(ns, 'default', { enumerable: true, value: value }); +/******/ if(mode & 2 && typeof value != 'string') for(var key in value) __webpack_require__.d(ns, key, function(key) { return value[key]; }.bind(null, key)); +/******/ return ns; +/******/ }; +/******/ +/******/ // getDefaultExport function for compatibility with non-harmony modules +/******/ __webpack_require__.n = function(module) { +/******/ var getter = module && module.__esModule ? 
+/******/ function getDefault() { return module['default']; } : +/******/ function getModuleExports() { return module; }; +/******/ __webpack_require__.d(getter, 'a', getter); +/******/ return getter; +/******/ }; +/******/ +/******/ // Object.prototype.hasOwnProperty.call +/******/ __webpack_require__.o = function(object, property) { return Object.prototype.hasOwnProperty.call(object, property); }; +/******/ +/******/ // __webpack_public_path__ +/******/ __webpack_require__.p = ""; +/******/ +/******/ +/******/ // Load entry module and return exports +/******/ return __webpack_require__(__webpack_require__.s = 0); +/******/ }) +/************************************************************************/ +/******/ ([ +/* 0 */ +/***/ (function(module, exports, __webpack_require__) { + +var __WEBPACK_AMD_DEFINE_FACTORY__, __WEBPACK_AMD_DEFINE_ARRAY__, __WEBPACK_AMD_DEFINE_RESULT__;function _typeof(obj) { "@babel/helpers - typeof"; if (typeof Symbol === "function" && typeof Symbol.iterator === "symbol") { _typeof = function _typeof(obj) { return typeof obj; }; } else { _typeof = function _typeof(obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; }; } return _typeof(obj); } + +(function (global, factory) { + if (true) { + !(__WEBPACK_AMD_DEFINE_ARRAY__ = [], __WEBPACK_AMD_DEFINE_FACTORY__ = (factory), + __WEBPACK_AMD_DEFINE_RESULT__ = (typeof __WEBPACK_AMD_DEFINE_FACTORY__ === 'function' ? + (__WEBPACK_AMD_DEFINE_FACTORY__.apply(exports, __WEBPACK_AMD_DEFINE_ARRAY__)) : __WEBPACK_AMD_DEFINE_FACTORY__), + __WEBPACK_AMD_DEFINE_RESULT__ !== undefined && (module.exports = __WEBPACK_AMD_DEFINE_RESULT__)); + } else { var mod; } +})(typeof globalThis !== "undefined" ? globalThis : typeof self !== "undefined" ? 
self : this, function () { + "use strict"; + + function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } } + + function _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } + + function _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; } + + function _inherits(subClass, superClass) { if (typeof superClass !== "function" && superClass !== null) { throw new TypeError("Super expression must either be null or a function"); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, writable: true, configurable: true } }); if (superClass) _setPrototypeOf(subClass, superClass); } + + function _setPrototypeOf(o, p) { _setPrototypeOf = Object.setPrototypeOf || function _setPrototypeOf(o, p) { o.__proto__ = p; return o; }; return _setPrototypeOf(o, p); } + + function _createSuper(Derived) { var hasNativeReflectConstruct = _isNativeReflectConstruct(); return function _createSuperInternal() { var Super = _getPrototypeOf(Derived), result; if (hasNativeReflectConstruct) { var NewTarget = _getPrototypeOf(this).constructor; result = Reflect.construct(Super, arguments, NewTarget); } else { result = Super.apply(this, arguments); } return _possibleConstructorReturn(this, result); }; } + + function _possibleConstructorReturn(self, call) { if (call && (_typeof(call) === "object" || typeof call === "function")) { return call; } return _assertThisInitialized(self); } + + function _assertThisInitialized(self) { if (self === void 0) { throw new 
ReferenceError("this hasn't been initialised - super() hasn't been called"); } return self; } + + function _isNativeReflectConstruct() { if (typeof Reflect === "undefined" || !Reflect.construct) return false; if (Reflect.construct.sham) return false; if (typeof Proxy === "function") return true; try { Boolean.prototype.valueOf.call(Reflect.construct(Boolean, [], function () {})); return true; } catch (e) { return false; } } + + function _getPrototypeOf(o) { _getPrototypeOf = Object.setPrototypeOf ? Object.getPrototypeOf : function _getPrototypeOf(o) { return o.__proto__ || Object.getPrototypeOf(o); }; return _getPrototypeOf(o); } + + var _require = __webpack_require__(1), + Pangu = _require.Pangu; + + function once(func) { + var _arguments = arguments, + _this = this; + + var executed = false; + return function () { + if (executed) { + return; + } + + var self = _this; + executed = true; + func.apply(self, _arguments); + }; + } + + function debounce(func, delay, mustRunDelay) { + var _arguments2 = arguments, + _this2 = this; + + var timer = null; + var startTime = null; + return function () { + var self = _this2; + var args = _arguments2; + var currentTime = +new Date(); + clearTimeout(timer); + + if (!startTime) { + startTime = currentTime; + } + + if (currentTime - startTime >= mustRunDelay) { + func.apply(self, args); + startTime = currentTime; + } else { + timer = setTimeout(function () { + func.apply(self, args); + }, delay); + } + }; + } + + var BrowserPangu = function (_Pangu) { + _inherits(BrowserPangu, _Pangu); + + var _super = _createSuper(BrowserPangu); + + function BrowserPangu() { + var _this3; + + _classCallCheck(this, BrowserPangu); + + _this3 = _super.call(this); + _this3.blockTags = /^(div|p|h1|h2|h3|h4|h5|h6)$/i; + _this3.ignoredTags = /^(script|code|pre|textarea)$/i; + _this3.presentationalTags = /^(b|code|del|em|i|s|strong|kbd)$/i; + _this3.spaceLikeTags = /^(br|hr|i|img|pangu)$/i; + _this3.spaceSensitiveTags = /^(a|del|pre|s|strike|u)$/i; + 
_this3.isAutoSpacingPageExecuted = false; + return _this3; + } + + _createClass(BrowserPangu, [{ + key: "isContentEditable", + value: function isContentEditable(node) { + return node.isContentEditable || node.getAttribute && node.getAttribute('g_editable') === 'true'; + } + }, { + key: "isSpecificTag", + value: function isSpecificTag(node, tagRegex) { + return node && node.nodeName && node.nodeName.search(tagRegex) >= 0; + } + }, { + key: "isInsideSpecificTag", + value: function isInsideSpecificTag(node, tagRegex) { + var checkCurrent = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false; + var currentNode = node; + + if (checkCurrent) { + if (this.isSpecificTag(currentNode, tagRegex)) { + return true; + } + } + + while (currentNode.parentNode) { + currentNode = currentNode.parentNode; + + if (this.isSpecificTag(currentNode, tagRegex)) { + return true; + } + } + + return false; + } + }, { + key: "canIgnoreNode", + value: function canIgnoreNode(node) { + var currentNode = node; + + if (currentNode && (this.isSpecificTag(currentNode, this.ignoredTags) || this.isContentEditable(currentNode))) { + return true; + } + + while (currentNode.parentNode) { + currentNode = currentNode.parentNode; + + if (currentNode && (this.isSpecificTag(currentNode, this.ignoredTags) || this.isContentEditable(currentNode))) { + return true; + } + } + + return false; + } + }, { + key: "isFirstTextChild", + value: function isFirstTextChild(parentNode, targetNode) { + var childNodes = parentNode.childNodes; + + for (var i = 0; i < childNodes.length; i++) { + var childNode = childNodes[i]; + + if (childNode.nodeType !== Node.COMMENT_NODE && childNode.textContent) { + return childNode === targetNode; + } + } + + return false; + } + }, { + key: "isLastTextChild", + value: function isLastTextChild(parentNode, targetNode) { + var childNodes = parentNode.childNodes; + + for (var i = childNodes.length - 1; i > -1; i--) { + var childNode = childNodes[i]; + + if 
(childNode.nodeType !== Node.COMMENT_NODE && childNode.textContent) { + return childNode === targetNode; + } + } + + return false; + } + }, { + key: "spacingNodeByXPath", + value: function spacingNodeByXPath(xPathQuery, contextNode) { + if (!(contextNode instanceof Node) || contextNode instanceof DocumentFragment) { + return; + } + + var textNodes = document.evaluate(xPathQuery, contextNode, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); + var currentTextNode; + var nextTextNode; + + for (var i = textNodes.snapshotLength - 1; i > -1; --i) { + currentTextNode = textNodes.snapshotItem(i); + + if (this.isSpecificTag(currentTextNode.parentNode, this.presentationalTags) && !this.isInsideSpecificTag(currentTextNode.parentNode, this.ignoredTags)) { + var elementNode = currentTextNode.parentNode; + + if (elementNode.previousSibling) { + var previousSibling = elementNode.previousSibling; + + if (previousSibling.nodeType === Node.TEXT_NODE) { + var testText = previousSibling.data.substr(-1) + currentTextNode.data.toString().charAt(0); + var testNewText = this.spacing(testText); + + if (testText !== testNewText) { + previousSibling.data = "".concat(previousSibling.data, " "); + } + } + } + + if (elementNode.nextSibling) { + var nextSibling = elementNode.nextSibling; + + if (nextSibling.nodeType === Node.TEXT_NODE) { + var _testText = currentTextNode.data.substr(-1) + nextSibling.data.toString().charAt(0); + + var _testNewText = this.spacing(_testText); + + if (_testText !== _testNewText) { + nextSibling.data = " ".concat(nextSibling.data); + } + } + } + } + + if (this.canIgnoreNode(currentTextNode)) { + nextTextNode = currentTextNode; + continue; + } + + var newText = this.spacing(currentTextNode.data); + + if (currentTextNode.data !== newText) { + currentTextNode.data = newText; + } + + if (nextTextNode) { + if (currentTextNode.nextSibling && currentTextNode.nextSibling.nodeName.search(this.spaceLikeTags) >= 0) { + nextTextNode = currentTextNode; + continue; + } + + 
var _testText2 = currentTextNode.data.toString().substr(-1) + nextTextNode.data.toString().substr(0, 1); + + var _testNewText2 = this.spacing(_testText2); + + if (_testNewText2 !== _testText2) { + var nextNode = nextTextNode; + + while (nextNode.parentNode && nextNode.nodeName.search(this.spaceSensitiveTags) === -1 && this.isFirstTextChild(nextNode.parentNode, nextNode)) { + nextNode = nextNode.parentNode; + } + + var currentNode = currentTextNode; + + while (currentNode.parentNode && currentNode.nodeName.search(this.spaceSensitiveTags) === -1 && this.isLastTextChild(currentNode.parentNode, currentNode)) { + currentNode = currentNode.parentNode; + } + + if (currentNode.nextSibling) { + if (currentNode.nextSibling.nodeName.search(this.spaceLikeTags) >= 0) { + nextTextNode = currentTextNode; + continue; + } + } + + if (currentNode.nodeName.search(this.blockTags) === -1) { + if (nextNode.nodeName.search(this.spaceSensitiveTags) === -1) { + if (nextNode.nodeName.search(this.ignoredTags) === -1 && nextNode.nodeName.search(this.blockTags) === -1) { + if (nextTextNode.previousSibling) { + if (nextTextNode.previousSibling.nodeName.search(this.spaceLikeTags) === -1) { + nextTextNode.data = " ".concat(nextTextNode.data); + } + } else { + if (!this.canIgnoreNode(nextTextNode)) { + nextTextNode.data = " ".concat(nextTextNode.data); + } + } + } + } else if (currentNode.nodeName.search(this.spaceSensitiveTags) === -1) { + currentTextNode.data = "".concat(currentTextNode.data, " "); + } else { + var panguSpace = document.createElement('pangu'); + panguSpace.innerHTML = ' '; + + if (nextNode.previousSibling) { + if (nextNode.previousSibling.nodeName.search(this.spaceLikeTags) === -1) { + nextNode.parentNode.insertBefore(panguSpace, nextNode); + } + } else { + nextNode.parentNode.insertBefore(panguSpace, nextNode); + } + + if (!panguSpace.previousElementSibling) { + if (panguSpace.parentNode) { + panguSpace.parentNode.removeChild(panguSpace); + } + } + } + } + } + } + + 
nextTextNode = currentTextNode; + } + } + }, { + key: "spacingNode", + value: function spacingNode(contextNode) { + var xPathQuery = './/*/text()[normalize-space(.)]'; + + if (contextNode.children && contextNode.children.length === 0) { + xPathQuery = './/text()[normalize-space(.)]'; + } + + this.spacingNodeByXPath(xPathQuery, contextNode); + } + }, { + key: "spacingElementById", + value: function spacingElementById(idName) { + var xPathQuery = "id(\"".concat(idName, "\")//text()"); + this.spacingNodeByXPath(xPathQuery, document); + } + }, { + key: "spacingElementByClassName", + value: function spacingElementByClassName(className) { + var xPathQuery = "//*[contains(concat(\" \", normalize-space(@class), \" \"), \"".concat(className, "\")]//text()"); + this.spacingNodeByXPath(xPathQuery, document); + } + }, { + key: "spacingElementByTagName", + value: function spacingElementByTagName(tagName) { + var xPathQuery = "//".concat(tagName, "//text()"); + this.spacingNodeByXPath(xPathQuery, document); + } + }, { + key: "spacingPageTitle", + value: function spacingPageTitle() { + var xPathQuery = '/html/head/title/text()'; + this.spacingNodeByXPath(xPathQuery, document); + } + }, { + key: "spacingPageBody", + value: function spacingPageBody() { + var xPathQuery = '/html/body//*/text()[normalize-space(.)]'; + ['script', 'style', 'textarea'].forEach(function (tag) { + xPathQuery = "".concat(xPathQuery, "[translate(name(..),\"ABCDEFGHIJKLMNOPQRSTUVWXYZ\",\"abcdefghijklmnopqrstuvwxyz\")!=\"").concat(tag, "\"]"); + }); + this.spacingNodeByXPath(xPathQuery, document); + } + }, { + key: "spacingPage", + value: function spacingPage() { + this.spacingPageTitle(); + this.spacingPageBody(); + } + }, { + key: "autoSpacingPage", + value: function autoSpacingPage() { + var pageDelay = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1000; + var nodeDelay = arguments.length > 1 && arguments[1] !== undefined ? 
arguments[1] : 500; + var nodeMaxWait = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 2000; + + if (!(document.body instanceof Node)) { + return; + } + + if (this.isAutoSpacingPageExecuted) { + return; + } + + this.isAutoSpacingPageExecuted = true; + var self = this; + var onceSpacingPage = once(function () { + self.spacingPage(); + }); + var videos = document.getElementsByTagName('video'); + + if (videos.length === 0) { + setTimeout(function () { + onceSpacingPage(); + }, pageDelay); + } else { + for (var i = 0; i < videos.length; i++) { + var video = videos[i]; + + if (video.readyState === 4) { + setTimeout(function () { + onceSpacingPage(); + }, 3000); + break; + } + + video.addEventListener('loadeddata', function () { + setTimeout(function () { + onceSpacingPage(); + }, 4000); + }); + } + } + + var queue = []; + var debouncedSpacingNodes = debounce(function () { + while (queue.length) { + var node = queue.shift(); + + if (node) { + self.spacingNode(node); + } + } + }, nodeDelay, { + 'maxWait': nodeMaxWait + }); + var mutationObserver = new MutationObserver(function (mutations, observer) { + mutations.forEach(function (mutation) { + switch (mutation.type) { + case 'childList': + mutation.addedNodes.forEach(function (node) { + if (node.nodeType === Node.ELEMENT_NODE) { + queue.push(node); + } else if (node.nodeType === Node.TEXT_NODE) { + queue.push(node.parentNode); + } + }); + break; + + case 'characterData': + var node = mutation.target; + + if (node.nodeType === Node.TEXT_NODE) { + queue.push(node.parentNode); + } + + break; + + default: + break; + } + }); + debouncedSpacingNodes(); + }); + mutationObserver.observe(document.body, { + characterData: true, + childList: true, + subtree: true + }); + } + }]); + + return BrowserPangu; + }(Pangu); + + var pangu = new BrowserPangu(); + module.exports = pangu; + module.exports["default"] = pangu; + module.exports.Pangu = BrowserPangu; +}); + +/***/ }), +/* 1 */ +/***/ (function(module, exports, 
__webpack_require__) { + +var __WEBPACK_AMD_DEFINE_FACTORY__, __WEBPACK_AMD_DEFINE_ARRAY__, __WEBPACK_AMD_DEFINE_RESULT__;(function (global, factory) { + if (true) { + !(__WEBPACK_AMD_DEFINE_ARRAY__ = [], __WEBPACK_AMD_DEFINE_FACTORY__ = (factory), + __WEBPACK_AMD_DEFINE_RESULT__ = (typeof __WEBPACK_AMD_DEFINE_FACTORY__ === 'function' ? + (__WEBPACK_AMD_DEFINE_FACTORY__.apply(exports, __WEBPACK_AMD_DEFINE_ARRAY__)) : __WEBPACK_AMD_DEFINE_FACTORY__), + __WEBPACK_AMD_DEFINE_RESULT__ !== undefined && (module.exports = __WEBPACK_AMD_DEFINE_RESULT__)); + } else { var mod; } +})(typeof globalThis !== "undefined" ? globalThis : typeof self !== "undefined" ? self : this, function () { + "use strict"; + + function _typeof(obj) { "@babel/helpers - typeof"; if (typeof Symbol === "function" && typeof Symbol.iterator === "symbol") { _typeof = function _typeof(obj) { return typeof obj; }; } else { _typeof = function _typeof(obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ?
"symbol" : typeof obj; }; } return _typeof(obj); } + + function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } } + + function _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } + + function _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; } + + var CJK = "\u2E80-\u2EFF\u2F00-\u2FDF\u3040-\u309F\u30A0-\u30FA\u30FC-\u30FF\u3100-\u312F\u3200-\u32FF\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF"; + var ANY_CJK = new RegExp("[".concat(CJK, "]")); + var SYMBOL_WIDE = '`~!@#$%*^&()/\\-+=<>?:"{}|,.;\'[\\]·~¥%——|\\\\'; + var SYMBOL = '`~!@#$%^&()/\\-+=<>?:"{}|,.;\'[\\]·~¥%——|\\\\'; + var SYMBOL_LEFT = '`~!@#$%^&(/\\-+=<>?:"{|,.;\'[·~¥%——|\\\\'; + var SYMBOL_RIGHT = '`~!@#$%^&)/\\-+=<>?:"}|,.;\'\\]·~¥%——|\\\\'; + var SYMBOL_SAFE = '`~!#$%^&/+=<>?:"|,;\'·~¥%——|\\\\'; + var ALPHA_CJK = new RegExp("([A-Za-z_])([".concat(CJK, "]+)"), 'g'); + var CJK_ALPHA = new RegExp("([".concat(CJK, "]+)([A-Za-z_])"), 'g'); + var NUMBER_CJK = new RegExp("([0-9_])([".concat(CJK, "]+)"), 'g'); + var CJK_NUMBER = new RegExp("([".concat(CJK, "]+)([0-9_])"), 'g'); + var CJK_AND_ALPHA = new RegExp("([".concat(CJK, "]+)(&)([A-Za-z_])"), 'g'); + var ALPHA_AND_CJK = new RegExp("([A-Za-z_])(&)([".concat(CJK, "]+)"), 'g'); + var ALPHA_SYMBOL_CJK = new RegExp("([A-Za-z_])([".concat(SYMBOL_RIGHT, "])([").concat(CJK, "])"), 'g'); + var CJK_SYMBOL_ALPHA = new RegExp("([".concat(CJK, "])([").concat(SYMBOL_LEFT, "])([A-Za-z_])"), 'g'); + var NUMBER_SYMBOL_CJK = new RegExp("([0-9_])([".concat(SYMBOL,
"])([").concat(CJK, "])"), 'g'); + var CJK_SYMBOL_NUMBER = new RegExp("([".concat(CJK, "])([").concat(SYMBOL, "])([0-9_])"), 'g'); + var CJK_BRACKET = new RegExp("([".concat(CJK, "])([<\\[{\\(])"), 'g'); + var BRACKET_CJK = new RegExp("([>\\]\\)}])([".concat(CJK, "])"), 'g'); + var ALPHA_NUMBER_CJK = new RegExp("([A-Za-z_])([0-9_])([".concat(CJK, "])"), 'g'); + var CJK_SYMBOL_SYMBOL = new RegExp("([".concat(CJK, "])([").concat(SYMBOL_WIDE, "])([").concat(SYMBOL_WIDE, "])"), 'g'); + var SYMBOL_SYMBOL_CJK = new RegExp("([".concat(SYMBOL_WIDE, "])([").concat(SYMBOL_WIDE, "])([").concat(CJK, "])"), 'g'); + var CJK_SYMBOL_CJK_SYMBOL_CJK = new RegExp("([".concat(CJK, "])([").concat(SYMBOL_SAFE, "])([").concat(CJK, "])([").concat(SYMBOL_SAFE, "])([").concat(CJK, "])"), 'g'); + var CJK_SYMBOL_CJK = new RegExp("([".concat(CJK, "])([").concat(SYMBOL_SAFE, "])([").concat(CJK, "])"), 'g'); + var CJK_ACCOUNT_CJK = new RegExp("([".concat(CJK, "])(\\s*)(@[A-Za-z0-9_]*)(\\s*)([").concat(CJK, "]+)(\\s*)([A-Za-z0-9_]+)(\\s*)([").concat(CJK, "])")); /* A-Za-z, not A-z: the A-z range also covers the ASCII punctuation between Z and a. No 'g' flag, so only the first occurrence is rewritten. */ + + var Pangu = function () { + function Pangu() { + _classCallCheck(this, Pangu); + + this.version = '1.0.0'; + } + + _createClass(Pangu, [{ + key: "spacing", + value: function spacing(text) { /* Returns text unchanged when it is not a string, has length <= 1, or contains no CJK character; otherwise applies the replacements below in order. */ + if (typeof text !== 'string') { + console.warn("spacing(text) only accepts string but got ".concat(_typeof(text))); + return text; + } + + if (text.length <= 1 || !ANY_CJK.test(text)) { + return text; + } + + var self = this; + var newText = text; + newText = newText.replace(ALPHA_NUMBER_CJK, '$1$2 $3'); + newText = newText.replace(ALPHA_CJK, '$1 $2'); + newText = newText.replace(CJK_ALPHA, '$1 $2'); + newText = newText.replace(NUMBER_CJK, '$1 $2'); + newText = newText.replace(CJK_NUMBER, '$1 $2'); + newText = newText.replace(CJK_AND_ALPHA, '$1 $2 $3'); + newText = newText.replace(ALPHA_AND_CJK, '$1 $2 $3'); + newText = newText.replace(ALPHA_SYMBOL_CJK, '$1$2 $3'); + newText = newText.replace(CJK_SYMBOL_ALPHA, '$1 $2$3'); + newText =
newText.replace(NUMBER_SYMBOL_CJK, '$1$2 $3'); + newText = newText.replace(CJK_SYMBOL_NUMBER, '$1 $2$3'); + newText = newText.replace(CJK_SYMBOL_SYMBOL, '$1 $2$3'); + newText = newText.replace(SYMBOL_SYMBOL_CJK, '$1$2 $3'); + newText = newText.replace(BRACKET_CJK, '$1 $2'); + newText = newText.replace(CJK_BRACKET, '$1 $2'); + newText = newText.replace(CJK_SYMBOL_CJK_SYMBOL_CJK, '$1 $2 $3 $4 $5'); + newText = newText.replace(CJK_SYMBOL_CJK, '$1 $2 $3'); + newText = newText.replace(CJK_ACCOUNT_CJK, '$1 $3$5$7 $9'); + return newText; + } + }, { + key: "spacingText", + value: function spacingText(text) { /* Callback wrapper around spacing(): callback(err) if it throws, otherwise callback(null, newText). */ + var callback = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : function () {}; + var newText; + + try { + newText = this.spacing(text); + } catch (err) { + callback(err); + return; + } + + callback(null, newText); + } + }, { + key: "spacingTextSync", + value: function spacingTextSync(text) { + return this.spacing(text); + } + }]); + + return Pangu; + }(); + + var pangu = new Pangu(); + module.exports = pangu; + module.exports["default"] = pangu; + module.exports.Pangu = Pangu; +}); + +/***/ }) +/******/ ]); +}); \ No newline at end of file diff --git a/panguFilter.cabal b/panguFilter.cabal new file mode 100644 index 0000000..181782e --- /dev/null +++ b/panguFilter.cabal @@ -0,0 +1,109 @@ +cabal-version: 3.0 +-- The cabal-version field refers to the version of the .cabal specification, +-- and can be different from the cabal-install (the tool) version and the +-- Cabal (the library) version you are using. As such, the Cabal (the library) +-- version used must be equal or greater than the version stated in this field. +-- Starting from the specification version 2.2, the cabal-version field must be +-- the first thing in the cabal file. + +-- Initial package description 'panguFilter' generated by +-- 'cabal init'. For further documentation, see: +-- http://haskell.org/cabal/users-guide/ +-- +-- The name of the package.
+name: panguFilter + +-- The package version. +-- See the Haskell package versioning policy (PVP) for standards +-- guiding when and how versions should be incremented. +-- https://pvp.haskell.org +-- PVP summary: +-+------- breaking API changes +-- | | +----- non-breaking API additions +-- | | | +--- code changes with no API change +version: 0.1.0.0 + +-- A short (one-line) description of the package. +-- synopsis: + +-- A longer description of the package. +-- description: + +-- The license under which the package is released. +license: BSD-3-Clause + +-- The file containing the license text. +license-file: LICENSE + +-- The package author(s). +author: Yu Cong + +-- An email address to which users can send suggestions, bug reports, and patches. +maintainer: sxlxcsxlxc@gmail.com + +-- A copyright notice. +-- copyright: +category: Text +build-type: Simple + +-- Extra doc files to be distributed with the package, such as a CHANGELOG or a README. +extra-doc-files: CHANGELOG.md + +-- Extra source files to be distributed with the package, such as examples, or a tutorial module. +-- extra-source-files: + +common warnings + ghc-options: -Wall + +library + -- Import common warning flags. + import: warnings + + -- Modules exported by the library. + exposed-modules: MyLib + + -- Modules included in this library but not exported. + -- other-modules: + + -- LANGUAGE extensions used by modules in this package. + -- other-extensions: + + -- Other library packages from which modules are imported. + build-depends: + base ^>=4.18.3.0, + text, + megaparsec, + replace-megaparsec + + -- Directories containing source files. + hs-source-dirs: src + + -- Base language which the package is written in. + default-language: Haskell2010 + +test-suite panguFilter-test + -- Import common warning flags. + import: warnings + + -- Base language which the package is written in. + default-language: Haskell2010 + + -- Modules included in this executable, other than Main. 
+    -- other-modules:
+
+    -- LANGUAGE extensions used by modules in this package.
+    -- other-extensions:
+
+    -- The interface type and version of the test suite.
+    type:             exitcode-stdio-1.0
+
+    -- Directories containing source files.
+    hs-source-dirs:   test
+
+    -- The entrypoint to the test suite.
+    main-is:          Main.hs
+
+    -- Test dependencies.
+    build-depends:
+        base ^>=4.18.3.0,
+        panguFilter,
+        hspec
diff --git a/src/MyLib.hs b/src/MyLib.hs
new file mode 100644
index 0000000..202df12
--- /dev/null
+++ b/src/MyLib.hs
@@ -0,0 +1,104 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module MyLib where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import Data.Void (Void)
+import Replace.Megaparsec (streamEdit)
+import Text.Megaparsec
+import Text.Megaparsec.Char
+
+-------------------------------------------------------------------------------
+-- | Parser over 'Text' input with no custom error component.
+type Parser = Parsec Void Text
+
+-- | A rewrite rule: parses a span of input and returns its replacement text.
+type Rule = Parser Text
+
+-- | Rules in priority order: 'applyRules' tries them left to right.
+type RuleSet = [Rule]
+
+-- | Replace every span of the input matched by one of the rules with that
+-- rule's result; unmatched text is copied through unchanged.
+--
+-- Every rule is wrapped in 'try'. 'choice' only falls through to the next
+-- alternative when the previous one failed without consuming input, so a rule
+-- that consumes a prefix before failing (for example 'emailAtRule', whose
+-- 'alphaNumChar' accepts CJK letters) would otherwise mask all later rules.
+applyRules :: RuleSet -> Text -> Text
+applyRules [] input = input
+applyRules rules input = streamEdit (choice (map try rules)) id input
+
+-- TEST RULES
+
+-- | Replace the literal @apple@ with @orange@.
+appleToOrange :: Rule
+appleToOrange = "orange" <$ chunk "apple"
+
+-- | Rewrite the at-sign of an e-mail-like token to @[at]@.
+emailAtRule :: Rule
+emailAtRule = do
+  prefix <- some (alphaNumChar <|> oneOf ("._%+-" :: String))
+  _ <- char '@'
+  suffix <- some (alphaNumChar <|> oneOf (".-" :: String))
+  return $ T.pack prefix <> "[at]" <> T.pack suffix
+
+-------------------------------------------------------------------------------
+-- rules for pangu
+
+-- | Check if a character falls within the CJK ranges provided
+isCJK :: Char -> Bool
+isCJK c = any (\(start, end) -> c >= start && c <= end) cjkRanges
+  where
+    cjkRanges =
+      [ ('\x2e80', '\x2eff'),
+        ('\x2f00', '\x2fdf'),
+        ('\x3040', '\x309f'),
+        ('\x30a0', '\x30fa'),
+        ('\x30fc', '\x30ff'),
+        ('\x3100', '\x312f'),
+        ('\x3200', '\x32ff'),
+        ('\x3400', '\x4dbf'),
+        ('\x4e00', '\x9fff'),
+        ('\xf900', '\xfaff')
+      ]
+
+-- | Map a half-width punctuation mark to its full-width counterpart; any
+-- other character is returned unchanged. The targets are written as code
+-- point escapes so they cannot be folded back to ASCII by normalisation.
+convertToFullwidth :: Char -> Char
+convertToFullwidth c = case c of
+  ':' -> '\xff1a' -- FULLWIDTH COLON
+  '.' -> '\x3002' -- IDEOGRAPHIC FULL STOP
+  '~' -> '\xff5e' -- FULLWIDTH TILDE
+  '!' -> '\xff01' -- FULLWIDTH EXCLAMATION MARK
+  '?' -> '\xff1f' -- FULLWIDTH QUESTION MARK
+  ',' -> '\xff0c' -- FULLWIDTH COMMA
+  ';' -> '\xff1b' -- FULLWIDTH SEMICOLON
+  _ -> c
+
+-- A parser that matches a single CJK character
+cjkChar :: Parser Char
+cjkChar = satisfy isCJK
+
+-- | Convert @:@ or @.@ between two CJK characters to full-width. The shape
+-- follows pangu.py's @CONVERT_TO_FULLWIDTH_CJK_SYMBOLS_CJK@ pattern: optional
+-- spaces, then a run of colons or a single dot, then optional spaces.
+fullWidthSymbolRule :: Rule
+fullWidthSymbolRule = do
+  c1 <- cjkChar
+  leading <- many (char ' ')
+  symbols <- some (char ':') <|> fmap (: []) (char '.')
+  trailing <- many (char ' ')
+  -- Only peek at the right-hand CJK character: leaving it unconsumed lets it
+  -- act as the left-hand side of an adjacent match (e.g. two colons in a row
+  -- separated by a single CJK character).
+  _ <- lookAhead cjkChar
+  let middle = leading <> map convertToFullwidth symbols <> trailing
+  return $ T.cons c1 (T.pack middle)
+
+-- | The rule set, in the order the rules are tried. No per-rule 'try' is
+-- needed here because 'applyRules' adds it.
+myRules :: RuleSet
+myRules = [appleToOrange, emailAtRule, fullWidthSymbolRule]
\ No newline at end of file
diff --git a/test/Main.hs b/test/Main.hs
new file mode 100644
index 0000000..89752fe
--- /dev/null
+++ b/test/Main.hs
@@ -0,0 +1,23 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Main (main) where
+import MyLib
+import Test.Hspec
+
+
+main :: IO ()
+main = hspec $ do
+  describe "MyLib.mapemail" $ do
+    it "maps @ to [at] in emails" $ do
+      applyRules myRules "aaa@a.com" `shouldBe` "aaa[at]a.com"
+
+  describe "MyLib.mapfruits" $ do
+    it "maps apple to orange" $ do
+      applyRules myRules "apple" `shouldBe` "orange"
+
+  describe "MyLib.fullWidthSymbolRule" $ do
+    it "你:好" $ do
+      -- Input uses an ASCII colon; the expected output uses U+FF1A.
+      applyRules myRules "你:好" `shouldBe` "你\xff1a好"
+
+    it "converts colons that share a CJK neighbour" $ do
+      applyRules myRules "你:好:吗" `shouldBe` "你\xff1a好\xff1a吗"
\ No newline at end of file