"""Rewrite Mathpix ("mathsnip") OCR text from the clipboard as LaTeX markup.

Recovered from the single code cell of the notebook
``文本处理工具/剪贴板文本整理_mathsnip.ipynb`` (commit 158fc7bb, "20230109 night").

Run as a script or notebook cell, it reads the Windows clipboard, rewrites the
text with :func:`process_text`, puts the result back on the clipboard and also
saves it to ``临时文件/outputfile.txt``.

NOTE(review): the recovered source had its whitespace collapsed and its
full-width punctuation replaced by ASCII look-alikes, so indentation and the
full-width code points below are reconstructed. Confirm against the notebook.
"""
import itertools
import os
import re

try:
    import win32clipboard as wc
    import win32con
except ImportError:  # pywin32 is not installed (e.g. not running on Windows)
    wc = None
    win32con = None

# File that receives a copy of the processed text (relative to the CWD).
OUTPUT_PATH = "临时文件/outputfile.txt"


# ---------------------------------------------------------------------------
# Clipboard access
# ---------------------------------------------------------------------------
def _require_clipboard():
    """Raise RuntimeError when the pywin32 clipboard modules are unavailable."""
    if wc is None or win32con is None:
        raise RuntimeError(
            "pywin32 (win32clipboard / win32con) is required for clipboard access"
        )


def getCopy():
    """Return the clipboard content requested as CF_UNICODETEXT."""
    _require_clipboard()
    wc.OpenClipboard()
    try:
        return wc.GetClipboardData(win32con.CF_UNICODETEXT)
    finally:
        # Always release the clipboard, even if the read fails, so that other
        # programs are not locked out.
        wc.CloseClipboard()


def setCopy(str):
    """Replace the clipboard content with the text ``str``.

    The parameter name shadows the builtin; it is kept for compatibility with
    existing keyword callers.
    """
    _require_clipboard()
    wc.OpenClipboard()
    try:
        wc.EmptyClipboard()
        wc.SetClipboardData(win32con.CF_UNICODETEXT, str)
    finally:
        wc.CloseClipboard()


# ---------------------------------------------------------------------------
# re.sub callbacks (text-processing rules that predate 2022-07)
# Each takes a match object and returns the replacement text.
# ---------------------------------------------------------------------------
def full_stop(matchobj):
    """Return ". " for a bare full stop, ".\\n" when newlines follow it."""
    if matchobj.group(1) in ("\u3002", "\uff0e", "."):
        return ". "
    return ".\n"


def refine_brackets(matchobj):
    """Drop the first and last character of group 1."""
    return matchobj.group(1)[1:-1]


def insert_a_blank(matchobj):
    """Insert a space before the last character of group 1."""
    text = matchobj.group(1)
    return text[:-1] + " " + text[-1]


def multiple_choice(matchobj):
    """Wrap the four captured options as ``\\fourch{A}{B}{C}{D}`` + newline."""
    options = "}{".join(matchobj.group(k) for k in (1, 2, 3, 4))
    return "\\fourch{" + options + "}\n"


def boldsymbols(matchobj):
    """Prefix group 1 with ``\\i`` and wrap its last character in ``\\mathbf``."""
    text = matchobj.group(1)
    return "\\i" + text[:-1] + "\\mathbf{" + text[-1] + "}"


def boldsymbols_star(matchobj):
    """Return ``\\in \\mathbf{<group 1>}^*``."""
    return "\\in \\mathbf{" + matchobj.group(1) + "}^*"


def singleboldsymbols(matchobj):
    """Return group 1 as an inline ``$\\mathbf{...}$`` formula."""
    return "$\\mathbf{" + matchobj.group(1) + "}$"


def blackboardbold(matchobj):
    """Return ``\\mathbf{<group 1>}``."""
    return "\\mathbf{" + matchobj.group(1) + "}"


def limit(matchobj):
    """Return ``\\displaystyle\\lim_{<group 1>}``."""
    return "\\displaystyle\\lim_{" + matchobj.group(1) + "}"


def replace_i(matchobj):
    """Replace each ``i`` in group 1 by ``\\mathrm{i}``, scanning right to left.

    An ``i`` is left alone when the text from it onward contains ``item`` or
    ``overline``.
    """
    string = matchobj.group(1)
    for i in range(len(string) - 1, -1, -1):
        tail = string[i:]
        if string[i] == "i" and "item" not in tail and "overline" not in tail:
            string = string[:i] + "\\mathrm{i}" + string[i + 1:]
    return string


def refine_log(matchobj):
    """Return ``\\log_`` followed by group 1."""
    return r"\log_" + matchobj.group(1)


def refine_powers(matchobj):
    """Return ``<group 1>^<group 2>``."""
    return matchobj.group(1) + "^" + matchobj.group(2)


def refine_sequences(matchobj):
    """Wrap group 1 in escaped braces: ``\\{...\\}``."""
    return "\\{" + matchobj.group(1) + "\\}"


def refine_starting_brackets(matchobj):
    """Return ``$`` followed by group 1."""
    return "$" + matchobj.group(1)


def refine_left_operating_brackets(matchobj):
    """Return group 1 followed by group 2 (text between the groups is dropped)."""
    return matchobj.group(1) + matchobj.group(2)


def refine_right_operating_brackets(matchobj):
    """Return group 1 followed by group 2 (text between the groups is dropped)."""
    return matchobj.group(1) + matchobj.group(2)


def refine_brackets_in_brackets(matchobj):
    """Return groups 1, 2 and 3 concatenated."""
    return matchobj.group(1) + matchobj.group(2) + matchobj.group(3)


def mathbf(matchobj):
    """Return ``\\mathbf{<group 1>}^<group 2>``."""
    return "\\mathbf{" + matchobj.group(1) + "}^" + matchobj.group(2)


# ---------------------------------------------------------------------------
# Bracket-marker helpers (added 2022-07-15)
# They work on text whose braces were renamed to "leftbracketN"/"rightbracketN".
# ---------------------------------------------------------------------------
# Nesting level read by rename_bracket. The original only had a module-level
# `global layer` statement (a no-op) and never assigned the name, so calling
# rename_bracket raised NameError; 0 is a default callers may overwrite.
layer = 0


def rename_bracket(matchobj):
    """Wrap group 1 in ``leftbracket<layer>`` / ``rightbracket<layer>`` markers."""
    level = str(layer)
    return "leftbracket" + level + matchobj.group(1) + "rightbracket" + level


def frac_brackets(matchobj):
    """Return ``frac{<group 1>}{<group 2>}``."""
    return "frac{" + matchobj.group(1) + "}{" + matchobj.group(2) + "}"


def frac_single_second_bracket(matchobj):
    """Return ``frac <group 1>{<group 2>}``."""
    return "frac " + matchobj.group(1) + "{" + matchobj.group(2) + "}"


def recall_vital_bracket(matchobj):
    """Return ``<group 1>{<group 2>}``, i.e. restore real braces."""
    return matchobj.group(1) + "{" + matchobj.group(2) + "}"


def sqrt_brackets(matchobj):
    """Return ``sqrt <optional group 1>{<group 2>}``."""
    first_group = matchobj.group(1) or ""
    return "sqrt " + first_group + "{" + matchobj.group(2) + "}"


def refine_single_second_frac(string):
    """Restore braces on ``frac x leftbracketN...rightbracketN`` for N in 0..6."""
    for s in range(7):
        level = str(s)
        string = re.sub(
            r"frac[\s]*(\w)[\s]*leftbracket" + level + "(.*?)rightbracket" + level,
            frac_single_second_bracket,
            string,
        )
    return string


# Tokens whose following bracket marker pair must become real braces.
_VITAL_TOKENS = (
    "frac", "line", "arrow", "_", r"\^", "mathrm", "mathbf", "begin", "end",
)


def refine_vital_bracket(string):
    """Restore braces after each token in ``_VITAL_TOKENS`` for levels 0..6."""
    for s in range(7):
        level = str(s)
        for token in _VITAL_TOKENS:
            string = re.sub(
                "(" + token + r")[\s]*leftbracket" + level
                + "(.*?)rightbracket" + level,
                recall_vital_bracket,
                string,
            )
    return string


def refine_sqrt(string):
    """Restore braces on ``sqrt [n] leftbracketN...rightbracketN`` for N in 0..6."""
    for s in range(7):
        level = str(s)
        string = re.sub(
            r"sqrt[\s]*(\[\w*\])*[\s]*leftbracket" + level
            + "(.*?)rightbracket" + level,
            sqrt_brackets,
            string,
        )
    return string


def give_blanks(string):
    """Insert a space between ``sqrt``/``frac`` and a directly following word char."""
    string = re.sub(r"(sqrt[\w])", insert_a_blank, string)
    string = re.sub(r"(frac[\w])", insert_a_blank, string)
    return string


def give_brackets(string):
    """Delete remaining bracket markers and turn set markers into ``\\{``/``\\}``."""
    string = re.sub(r"leftbracket\d", "", string)
    string = re.sub(r"rightbracket\d", "", string)
    string = string.replace("leftset", "\\{")
    string = string.replace("rightset", "\\}")
    return string


def initial_bracket_search(string):
    """Return the end index of a leading ``leftbracket<digit>`` marker, or -1.

    Only whitespace may precede the marker.
    """
    found = re.search(r"^[\s]*?leftbracket(\d)", string)
    return -1 if found is None else found.end()


def initial_brackets_pair_search(string, d):
    """Return the end index of the first ``rightbracket<d>`` in string, or -1."""
    found = re.search("rightbracket" + d, string)
    return -1 if found is None else found.end()


def refine_frac(string):
    """Turn the two marker groups that follow each ``frac`` into real braces.

    A ``frac`` that is not directly followed by two complete marker groups is
    left untouched (brace handling revised 2022-07-18).
    """
    eq_left = ""
    eq_right = string
    while True:
        found = re.search("frac", eq_right)
        if found is None:
            break
        # Move everything up to and including "frac" to the finished part.
        eq_left += eq_right[:found.end()]
        eq_right = eq_right[found.end():]

        first_open = initial_bracket_search(eq_right)
        if first_open <= 0:
            continue
        # The character just before first_open is the marker's level digit.
        first_end = initial_brackets_pair_search(eq_right, eq_right[first_open - 1])
        if first_end < 0:
            # No closing marker: the original sliced with -1 here and mangled
            # the text; skip this "frac" instead.
            continue
        first_bracket = eq_right[:first_end]
        first_layer = first_bracket[-1]
        eq_remain = eq_right[first_end:]

        second_open = initial_bracket_search(eq_remain)
        if second_open <= 0:
            continue
        second_end = initial_brackets_pair_search(
            eq_remain, eq_remain[second_open - 1]
        )
        if second_end < 0:
            continue
        second_bracket = eq_remain[:second_end]
        second_layer = second_bracket[-1]

        first_bracket = first_bracket.replace(
            "leftbracket" + first_layer, "{"
        ).replace("rightbracket" + first_layer, "}")
        second_bracket = second_bracket.replace(
            "leftbracket" + second_layer, "{"
        ).replace("rightbracket" + second_layer, "}")
        eq_right = first_bracket + second_bracket + eq_remain[second_end:]
    return eq_left + eq_right


def reduce_blank(matchobj):
    """Remove every space from group 1."""
    return matchobj.group(1).replace(" ", "")


def add_dollars(matchobj):
    """Put ``$`` after the first and before the last character of group 1."""
    text = matchobj.group(1)
    return text[0] + "$" + text[1:-1] + "$" + text[-1]


def del_first_char(matchobj):
    """Drop the first character of group 1."""
    return matchobj.group(1)[1:]


def add_underline(matchobj):
    """Join the first and last character of group 1 with an underscore."""
    text = matchobj.group(1)
    return text[0] + "_" + text[-1]


def brackets_to_cwords(matchobj):
    """Wrap group 1 in the placeholder words 左括号 / 右括号."""
    return "左括号" + matchobj.group(1) + "右括号"


def cwords_to_brackets(matchobj):
    """Wrap group 1 in parentheses."""
    return "(" + matchobj.group(1) + ")"


def circled_brackets(matchobj):
    """Wrap the last character of group 1 in braces."""
    text = matchobj.group(1)
    return text[:-1] + "{" + text[-1] + "}"


# ---------------------------------------------------------------------------
# Text pipeline
# ---------------------------------------------------------------------------
# Ideographic / full-width full stop plus any newlines that follow it.
_FULL_STOP_RE = re.compile("([\u3002\uff0e]\n*)")

# Full-width -> half-width replacements, applied in order.
# The recovered source showed ASCII characters on the left-hand side; as regex
# patterns "(" and "?" do not even compile. The section comment says
# "full-width to half-width", so the full-width code points are spelled out
# with escapes that Unicode normalisation cannot damage again.
_PUNCTUATION = (
    ("\uff0c", ", "),   # full-width comma
    ("\uff1a", ": "),   # full-width colon
    ("\uff1b", "; "),   # full-width semicolon
    ("\uff08", "("),    # full-width left parenthesis
    ("\uff09", ")"),    # full-width right parenthesis
    ("\uff1f", "? "),   # full-width question mark
    ("\u201c", "``"),   # left double quote -> LaTeX opening quotes
    ("\u201d", "''"),   # right double quote -> LaTeX closing quotes
    (" ``", "``"),
    ("'' ", "''"),
)

_CHOICE_RES = (
    re.compile(r"A\.([\s\S]*?)B\.([\s\S]*?)C\.([\s\S]*?)D\.([\s\S]*?)\n"),
    re.compile(r"\(A\)([\s\S]*?)\(B\)([\s\S]*?)\(C\)([\s\S]*?)\(D\)([\s\S]*?)\n"),
)

_EQUATION_RE = re.compile(r"\$[\s\S]*?\$")

# Characters Mathpix commonly misreads, with their corrections.
_OCR_TYPOS = (("雉", "锥"), ("粗圆", "椭圆"), ("针角", "钝角"))

_LIM_RE = re.compile(r"\\lim\s*_\{\s*n\s*(?:\\to|\\rightarrow)\s*\\infty\s*\}")


def _normalize_source(data):
    """Normalise arrows, delimiter sizing, punctuation and math delimiters."""
    data = data.replace(r"\rightarrow", r"\to")
    # Strip \left / \right (and the invisible "." delimiter). The lookahead
    # keeps commands such as \leftarrow or \rightharpoonup intact, which the
    # original plain .replace() calls truncated.
    data = re.sub(r"\\(?:left|right)(?![A-Za-z])\.?", "", data)

    # Full-width -> half-width punctuation.
    data = data.replace("\u3000", " ")  # ideographic space (assumed; see _PUNCTUATION)
    data = _FULL_STOP_RE.sub(full_stop, data)
    for old, new in _PUNCTUATION:
        data = data.replace(old, new)

    # Question numbers ("12. " or "例12. ") at the start of the text or of a
    # line become \item.
    data = re.sub(r"(?:^|\n)例*[0-9]+\.\s+", r"\n\\item ", data)

    # \[ ... \] becomes $ ... $; an empty "$$" pair is removed.
    data = data.replace("\\[", "$").replace("\\]", "$")
    data = data.replace("$$", "")

    # Multiple-choice options -> \fourch{..}{..}{..}{..}; two passes, as in
    # the original.
    for _ in range(2):
        for pattern in _CHOICE_RES:
            data = pattern.sub(multiple_choice, data)
    data = re.sub(r"\$[ ]+\}", "$}", data)
    data = re.sub(r"\{[ ]+\$", "{$", data)

    # Collapse runs of blank (whitespace-only) lines.
    return re.sub(r"\n(?:[\t ]*\n)+", "\n", data)


def _split_text_and_equations(data):
    """Split data into text pieces and ``$...$`` formulas (delimiters included).

    Returns ``(texts, equations)``; ``texts[i]`` precedes ``equations[i]`` and
    ``texts`` has at most one extra trailing element.
    """
    texts, equations = [], []
    cursor = 0
    for found in _EQUATION_RE.finditer(data):
        texts.append(data[cursor:found.start()])
        equations.append(found.group())
        cursor = found.end()
    if cursor < len(data):
        texts.append(data[cursor:])
    return texts, equations


def _refine_text(text):
    """Apply the plain-text (non-formula) rewriting rules to one text piece."""
    # Remove whitespace just inside option braces. The original used a lazy
    # quantifier after "{" and therefore removed a single character only.
    text = re.sub(r"\{\s+", "{", text)
    text = re.sub(r"\s+\}", "}", text)
    # Fill-in-the-blank: a run of spaces/underscores becomes \blank{50}.
    text = re.sub(r"[ _]{2,}", r"\\blank{50}", text)
    # Multiple-choice answer slot: empty parentheses become \bracket{20}.
    text = text.replace("(\\blank{50})", "\\bracket{20}")
    text = re.sub(r"\(\s{1,10}\)", r"\\bracket{20}", text)
    # Exactly one space after a comma.
    text = re.sub(r",[ ]*", ", ", text)
    text = text.replace(".}", "}")
    text = re.sub(r"\n\d{1,3}\.", r"\n\\item ", text)
    text = re.sub(r"\s{2,}\.", r"\\blank{50}.", text)
    text = re.sub(r"\s{2,},", r"\\blank{50},", text)
    return text.replace("\\bracket{20}\n", "\\bracket{20}.\n")


def _postprocess(text):
    """Apply the Mathpix-specific clean-up rules to the re-assembled text."""
    text = re.sub(r"[ ]+\n", "\n", text)
    text = re.sub(r"\$\s*?\\parallel\s*?\$", r"\\parallel", text)
    text = re.sub(r"\n例\s*?\d{1,3}\s*", r"\n\\item ", text)
    # "$,$" and similar: a lone punctuation mark loses its dollar signs.
    text = re.sub(r"(\$[\,\.:;]\$)", refine_brackets, text)

    # Remove the spaces Mathpix inserts between CJK characters and around "$".
    # NOTE(review): the original indentation was lost; all three substitutions
    # are assumed to sit inside the 3-pass loop.
    for _ in range(3):
        text = re.sub(
            r"([\u4e00-\u9fa5])( )([\u4e00-\u9fa5])",
            lambda m: m.group(1) + m.group(3),
            text,
        )
        text = text.replace("$ ", "$")
        text = text.replace(" $", "$")

    for wrong, right in _OCR_TYPOS:
        text = text.replace(wrong, right)

    text = text.replace(r"\vec", r"\overrightarrow ")
    text = text.replace(r"\bar", r"\overline ")
    # \rightarrow has already become \to by this point, so the original literal
    # "\lim _{n \rightarrow \infty}" could never match; accept both spellings.
    text = _LIM_RE.sub(lambda _m: r"\displaystyle\lim_{n\to\infty}", text)
    text = text.replace(" 、 ", "$、$")
    text = text.replace("slant", "")  # e.g. \leqslant -> \leq
    text = text.replace(r"\mid", "|")
    text = text.replace(r"\mathrm{\mathrm{i}}", r"\mathrm{i}")
    text = text.replace(",$", ", $")
    text = text.replace(" / /", r"\parallel")
    text = text.replace("mathrmR", "mathbf{R}")
    text = text.replace(r"^{\prime}", "'")
    # Use \frac instead of \dfrac inside super-/subscripts. The original
    # rewrote "_{\dfrac" to "^{_{\frac", which leaves an unbalanced brace.
    text = re.sub(
        r"([\^_]\{-?)\\dfrac", lambda m: m.group(1) + r"\frac", text
    )

    # Piecewise functions: array -> cases. A braced column spec such as "{ll}"
    # is dropped too; the original "[rcl]*" could not match through the braces.
    text = re.sub(
        r"\\begin\{array\}(?:\{[rcl|]*\}|[rcl]*)",
        lambda _m: r"\begin{cases}",
        text,
    )
    return text.replace(r"\end{array}", r"\end{cases}")


def process_text(data):
    """Return ``data`` rewritten as LaTeX markup.

    Steps: normalise the source, split it into text and ``$...$`` formulas,
    rewrite the text pieces (formulas pass through unchanged), re-assemble in
    the original order and apply the final clean-up rules.
    """
    data = _normalize_source(data)
    texts, equations = _split_text_and_equations(data)
    merged = "".join(
        _refine_text(text) + equation
        for text, equation in itertools.zip_longest(texts, equations, fillvalue="")
    )
    return _postprocess(merged)


if __name__ == "__main__":
    data = getCopy()
    modified_data = process_text(data)
    setCopy(modified_data)

    # Keep a copy on disk; create the target directory if it is missing.
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w", encoding="utf8") as f:
        f.write(modified_data)