910 lines
19 KiB
Plaintext
910 lines
19 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"旧题目数: 0 , 新题目数: 18100\n",
|
|
"开始新题与旧题的比对\n",
|
|
"50\n",
|
|
"100\n",
|
|
"150\n",
|
|
"200\n",
|
|
"250\n",
|
|
"300\n",
|
|
"350\n",
|
|
"400\n",
|
|
"450\n",
|
|
"500\n",
|
|
"550\n",
|
|
"600\n",
|
|
"650\n",
|
|
"700\n",
|
|
"750\n",
|
|
"800\n",
|
|
"850\n",
|
|
"900\n",
|
|
"950\n",
|
|
"1000\n",
|
|
"1050\n",
|
|
"1100\n",
|
|
"1150\n",
|
|
"1200\n",
|
|
"1250\n",
|
|
"1300\n",
|
|
"1350\n",
|
|
"1400\n",
|
|
"1450\n",
|
|
"1500\n",
|
|
"1550\n",
|
|
"1600\n",
|
|
"1650\n",
|
|
"1700\n",
|
|
"1750\n",
|
|
"1800\n",
|
|
"1850\n",
|
|
"1900\n",
|
|
"1950\n",
|
|
"2000\n",
|
|
"2050\n",
|
|
"2100\n",
|
|
"2150\n",
|
|
"2200\n",
|
|
"2250\n",
|
|
"2300\n",
|
|
"2350\n",
|
|
"2400\n",
|
|
"2450\n",
|
|
"2500\n",
|
|
"2550\n",
|
|
"2600\n",
|
|
"2650\n",
|
|
"2700\n",
|
|
"2750\n",
|
|
"2800\n",
|
|
"2850\n",
|
|
"2900\n",
|
|
"2950\n",
|
|
"3000\n",
|
|
"3050\n",
|
|
"3100\n",
|
|
"3150\n",
|
|
"3200\n",
|
|
"3250\n",
|
|
"3300\n",
|
|
"3350\n",
|
|
"3400\n",
|
|
"3450\n",
|
|
"3500\n",
|
|
"3550\n",
|
|
"3600\n",
|
|
"3650\n",
|
|
"3700\n",
|
|
"3750\n",
|
|
"3800\n",
|
|
"3850\n",
|
|
"3900\n",
|
|
"3950\n",
|
|
"4000\n",
|
|
"4050\n",
|
|
"4100\n",
|
|
"4150\n",
|
|
"4200\n",
|
|
"4250\n",
|
|
"4300\n",
|
|
"4350\n",
|
|
"4400\n",
|
|
"4450\n",
|
|
"4500\n",
|
|
"4550\n",
|
|
"4600\n",
|
|
"4650\n",
|
|
"4700\n",
|
|
"4750\n",
|
|
"4800\n",
|
|
"4850\n",
|
|
"4900\n",
|
|
"4950\n",
|
|
"5000\n",
|
|
"5050\n",
|
|
"5100\n",
|
|
"5150\n",
|
|
"5200\n",
|
|
"5250\n",
|
|
"5300\n",
|
|
"5350\n",
|
|
"5400\n",
|
|
"5450\n",
|
|
"5500\n",
|
|
"5550\n",
|
|
"5600\n",
|
|
"5650\n",
|
|
"5700\n",
|
|
"5750\n",
|
|
"5800\n",
|
|
"5850\n",
|
|
"5900\n",
|
|
"5950\n",
|
|
"6000\n",
|
|
"6050\n",
|
|
"6100\n",
|
|
"6150\n",
|
|
"6200\n",
|
|
"6250\n",
|
|
"6300\n",
|
|
"6350\n",
|
|
"6400\n",
|
|
"6450\n",
|
|
"6500\n",
|
|
"6550\n",
|
|
"6600\n",
|
|
"6650\n",
|
|
"6700\n",
|
|
"6750\n",
|
|
"6800\n",
|
|
"6850\n",
|
|
"6900\n",
|
|
"6950\n",
|
|
"7000\n",
|
|
"7050\n",
|
|
"7100\n",
|
|
"7150\n",
|
|
"7200\n",
|
|
"7250\n",
|
|
"7300\n",
|
|
"7350\n",
|
|
"7400\n",
|
|
"7450\n",
|
|
"7500\n",
|
|
"7550\n",
|
|
"7600\n",
|
|
"7650\n",
|
|
"7700\n",
|
|
"7750\n",
|
|
"7800\n",
|
|
"7850\n",
|
|
"7900\n",
|
|
"7950\n",
|
|
"8000\n",
|
|
"8050\n",
|
|
"8100\n",
|
|
"8150\n",
|
|
"8200\n",
|
|
"8250\n",
|
|
"8300\n",
|
|
"8350\n",
|
|
"8400\n",
|
|
"8450\n",
|
|
"8500\n",
|
|
"8550\n",
|
|
"8600\n",
|
|
"8650\n",
|
|
"8700\n",
|
|
"8750\n",
|
|
"8800\n",
|
|
"8850\n",
|
|
"8900\n",
|
|
"8950\n",
|
|
"9000\n",
|
|
"9050\n",
|
|
"9100\n",
|
|
"9150\n",
|
|
"9200\n",
|
|
"9250\n",
|
|
"9300\n",
|
|
"9350\n",
|
|
"9400\n",
|
|
"9450\n",
|
|
"9500\n",
|
|
"9550\n",
|
|
"9600\n",
|
|
"9650\n",
|
|
"9700\n",
|
|
"9750\n",
|
|
"9800\n",
|
|
"9850\n",
|
|
"9900\n",
|
|
"9950\n",
|
|
"10000\n",
|
|
"10050\n",
|
|
"10100\n",
|
|
"10150\n",
|
|
"10200\n",
|
|
"10250\n",
|
|
"10300\n",
|
|
"10350\n",
|
|
"10400\n",
|
|
"10450\n",
|
|
"10500\n",
|
|
"10550\n",
|
|
"10600\n",
|
|
"10650\n",
|
|
"10700\n",
|
|
"10750\n",
|
|
"10800\n",
|
|
"10850\n",
|
|
"10900\n",
|
|
"10950\n",
|
|
"11000\n",
|
|
"11050\n",
|
|
"11100\n",
|
|
"11150\n",
|
|
"11200\n",
|
|
"11250\n",
|
|
"11300\n",
|
|
"11350\n",
|
|
"11400\n",
|
|
"11450\n",
|
|
"11500\n",
|
|
"11550\n",
|
|
"11600\n",
|
|
"11650\n",
|
|
"11700\n",
|
|
"11750\n",
|
|
"11800\n",
|
|
"11850\n",
|
|
"11900\n",
|
|
"11950\n",
|
|
"12000\n",
|
|
"12050\n",
|
|
"12100\n",
|
|
"12150\n",
|
|
"12200\n",
|
|
"12250\n",
|
|
"12300\n",
|
|
"12350\n",
|
|
"12400\n",
|
|
"12450\n",
|
|
"12500\n",
|
|
"12550\n",
|
|
"12600\n",
|
|
"12650\n",
|
|
"12700\n",
|
|
"12750\n",
|
|
"12800\n",
|
|
"12850\n",
|
|
"12900\n",
|
|
"12950\n",
|
|
"13000\n",
|
|
"13050\n",
|
|
"13100\n",
|
|
"13150\n",
|
|
"13200\n",
|
|
"13250\n",
|
|
"13300\n",
|
|
"13350\n",
|
|
"13400\n",
|
|
"13450\n",
|
|
"13500\n",
|
|
"13550\n",
|
|
"13600\n",
|
|
"13650\n",
|
|
"13700\n",
|
|
"13750\n",
|
|
"13800\n",
|
|
"13850\n",
|
|
"13900\n",
|
|
"13950\n",
|
|
"14000\n",
|
|
"14050\n",
|
|
"14100\n",
|
|
"14150\n",
|
|
"14200\n",
|
|
"14250\n",
|
|
"14300\n",
|
|
"14350\n",
|
|
"14400\n",
|
|
"14450\n",
|
|
"14500\n",
|
|
"14550\n",
|
|
"14600\n",
|
|
"14650\n",
|
|
"14700\n",
|
|
"14750\n",
|
|
"14800\n",
|
|
"14850\n",
|
|
"14900\n",
|
|
"14950\n",
|
|
"15000\n",
|
|
"15050\n",
|
|
"15100\n",
|
|
"15150\n",
|
|
"15200\n",
|
|
"15250\n",
|
|
"15300\n",
|
|
"15350\n",
|
|
"15400\n",
|
|
"15450\n",
|
|
"15500\n",
|
|
"15550\n",
|
|
"15600\n",
|
|
"15650\n",
|
|
"15700\n",
|
|
"15750\n",
|
|
"15800\n",
|
|
"15850\n",
|
|
"15900\n",
|
|
"15950\n",
|
|
"16000\n",
|
|
"16050\n",
|
|
"16100\n",
|
|
"16150\n",
|
|
"16200\n",
|
|
"16250\n",
|
|
"16300\n",
|
|
"16350\n",
|
|
"16400\n",
|
|
"16450\n",
|
|
"16500\n",
|
|
"16550\n",
|
|
"16600\n",
|
|
"16650\n",
|
|
"16700\n",
|
|
"16750\n",
|
|
"16800\n",
|
|
"16850\n",
|
|
"16900\n",
|
|
"16950\n",
|
|
"17000\n",
|
|
"17050\n",
|
|
"17100\n",
|
|
"17150\n",
|
|
"17200\n",
|
|
"17250\n",
|
|
"17300\n",
|
|
"17350\n",
|
|
"17400\n",
|
|
"17450\n",
|
|
"17500\n",
|
|
"17550\n",
|
|
"17600\n",
|
|
"17650\n",
|
|
"17700\n",
|
|
"17750\n",
|
|
"17800\n",
|
|
"17850\n",
|
|
"17900\n",
|
|
"17950\n",
|
|
"18000\n",
|
|
"18050\n",
|
|
"18100\n",
|
|
"开始新题之间的比对\n",
|
|
"50\n",
|
|
"100\n",
|
|
"150\n",
|
|
"200\n",
|
|
"250\n",
|
|
"300\n",
|
|
"350\n",
|
|
"400\n",
|
|
"450\n",
|
|
"500\n",
|
|
"550\n",
|
|
"600\n",
|
|
"650\n",
|
|
"700\n",
|
|
"750\n",
|
|
"800\n",
|
|
"850\n",
|
|
"900\n",
|
|
"950\n",
|
|
"1000\n",
|
|
"1050\n",
|
|
"1100\n",
|
|
"1150\n",
|
|
"1200\n",
|
|
"1250\n",
|
|
"1300\n",
|
|
"1350\n",
|
|
"1400\n",
|
|
"1450\n",
|
|
"1500\n",
|
|
"1550\n",
|
|
"1600\n",
|
|
"1650\n",
|
|
"1700\n",
|
|
"1750\n",
|
|
"1800\n",
|
|
"1850\n",
|
|
"1900\n",
|
|
"1950\n",
|
|
"2000\n",
|
|
"2050\n",
|
|
"2100\n",
|
|
"2150\n",
|
|
"2200\n",
|
|
"2250\n",
|
|
"2300\n",
|
|
"2350\n",
|
|
"2400\n",
|
|
"2450\n",
|
|
"2500\n",
|
|
"2550\n",
|
|
"2600\n",
|
|
"2650\n",
|
|
"2700\n",
|
|
"2750\n",
|
|
"2800\n",
|
|
"2850\n",
|
|
"2900\n",
|
|
"2950\n",
|
|
"3000\n",
|
|
"3050\n",
|
|
"3100\n",
|
|
"3150\n",
|
|
"3200\n",
|
|
"3250\n",
|
|
"3300\n",
|
|
"3350\n",
|
|
"3400\n",
|
|
"3450\n",
|
|
"3500\n",
|
|
"3550\n",
|
|
"3600\n",
|
|
"3650\n",
|
|
"3700\n",
|
|
"3750\n",
|
|
"3800\n",
|
|
"3850\n",
|
|
"3900\n",
|
|
"3950\n",
|
|
"4000\n",
|
|
"4050\n",
|
|
"4100\n",
|
|
"4150\n",
|
|
"4200\n",
|
|
"4250\n",
|
|
"4300\n",
|
|
"4350\n",
|
|
"4400\n",
|
|
"4450\n",
|
|
"4500\n",
|
|
"4550\n",
|
|
"4600\n",
|
|
"4650\n",
|
|
"4700\n",
|
|
"4750\n",
|
|
"4800\n",
|
|
"4850\n",
|
|
"4900\n",
|
|
"4950\n",
|
|
"5000\n",
|
|
"5050\n",
|
|
"5100\n",
|
|
"5150\n",
|
|
"5200\n",
|
|
"5250\n",
|
|
"5300\n",
|
|
"5350\n",
|
|
"5400\n",
|
|
"5450\n",
|
|
"5500\n",
|
|
"5550\n",
|
|
"5600\n",
|
|
"5650\n",
|
|
"5700\n",
|
|
"5750\n",
|
|
"5800\n",
|
|
"5850\n",
|
|
"5900\n",
|
|
"5950\n",
|
|
"6000\n",
|
|
"6050\n",
|
|
"6100\n",
|
|
"6150\n",
|
|
"6200\n",
|
|
"6250\n",
|
|
"6300\n",
|
|
"6350\n",
|
|
"6400\n",
|
|
"6450\n",
|
|
"6500\n",
|
|
"6550\n",
|
|
"6600\n",
|
|
"6650\n",
|
|
"6700\n",
|
|
"6750\n",
|
|
"6800\n",
|
|
"6850\n",
|
|
"6900\n",
|
|
"6950\n",
|
|
"7000\n",
|
|
"7050\n",
|
|
"7100\n",
|
|
"7150\n",
|
|
"7200\n",
|
|
"7250\n",
|
|
"7300\n",
|
|
"7350\n",
|
|
"7400\n",
|
|
"7450\n",
|
|
"7500\n",
|
|
"7550\n",
|
|
"7600\n",
|
|
"7650\n",
|
|
"7700\n",
|
|
"7750\n",
|
|
"7800\n",
|
|
"7850\n",
|
|
"7900\n",
|
|
"7950\n",
|
|
"8000\n",
|
|
"8050\n",
|
|
"8100\n",
|
|
"8150\n",
|
|
"8200\n",
|
|
"8250\n",
|
|
"8300\n",
|
|
"8350\n",
|
|
"8400\n",
|
|
"8450\n",
|
|
"8500\n",
|
|
"8550\n",
|
|
"8600\n",
|
|
"8650\n",
|
|
"8700\n",
|
|
"8750\n",
|
|
"8800\n",
|
|
"8850\n",
|
|
"8900\n",
|
|
"8950\n",
|
|
"9000\n",
|
|
"9050\n",
|
|
"9100\n",
|
|
"9150\n",
|
|
"9200\n",
|
|
"9250\n",
|
|
"9300\n",
|
|
"9350\n",
|
|
"9400\n",
|
|
"9450\n",
|
|
"9500\n",
|
|
"9550\n",
|
|
"9600\n",
|
|
"9650\n",
|
|
"9700\n",
|
|
"9750\n",
|
|
"9800\n",
|
|
"9850\n",
|
|
"9900\n",
|
|
"9950\n",
|
|
"10000\n",
|
|
"10050\n",
|
|
"10100\n",
|
|
"10150\n",
|
|
"10200\n",
|
|
"10250\n",
|
|
"10300\n",
|
|
"10350\n",
|
|
"10400\n",
|
|
"10450\n",
|
|
"10500\n",
|
|
"10550\n",
|
|
"10600\n",
|
|
"10650\n",
|
|
"10700\n",
|
|
"10750\n",
|
|
"10800\n",
|
|
"10850\n",
|
|
"10900\n",
|
|
"10950\n",
|
|
"11000\n",
|
|
"11050\n",
|
|
"11100\n",
|
|
"11150\n",
|
|
"11200\n",
|
|
"11250\n",
|
|
"11300\n",
|
|
"11350\n",
|
|
"11400\n",
|
|
"11450\n",
|
|
"11500\n",
|
|
"11550\n",
|
|
"11600\n",
|
|
"11650\n",
|
|
"11700\n",
|
|
"11750\n",
|
|
"11800\n",
|
|
"11850\n",
|
|
"11900\n",
|
|
"11950\n",
|
|
"12000\n",
|
|
"12050\n",
|
|
"12100\n",
|
|
"12150\n",
|
|
"12200\n",
|
|
"12250\n",
|
|
"12300\n",
|
|
"12350\n",
|
|
"12400\n",
|
|
"12450\n",
|
|
"12500\n",
|
|
"12550\n",
|
|
"12600\n",
|
|
"12650\n",
|
|
"12700\n",
|
|
"12750\n",
|
|
"12800\n",
|
|
"12850\n",
|
|
"12900\n",
|
|
"12950\n",
|
|
"13000\n",
|
|
"13050\n",
|
|
"13100\n",
|
|
"13150\n",
|
|
"13200\n",
|
|
"13250\n",
|
|
"13300\n",
|
|
"13350\n",
|
|
"13400\n",
|
|
"13450\n",
|
|
"13500\n",
|
|
"13550\n",
|
|
"13600\n",
|
|
"13650\n",
|
|
"13700\n",
|
|
"13750\n",
|
|
"13800\n",
|
|
"13850\n",
|
|
"13900\n",
|
|
"13950\n",
|
|
"14000\n",
|
|
"14050\n",
|
|
"14100\n",
|
|
"14150\n",
|
|
"14200\n",
|
|
"14250\n",
|
|
"14300\n",
|
|
"14350\n",
|
|
"14400\n",
|
|
"14450\n",
|
|
"14500\n",
|
|
"14550\n",
|
|
"14600\n",
|
|
"14650\n",
|
|
"14700\n",
|
|
"14750\n",
|
|
"14800\n",
|
|
"14850\n",
|
|
"14900\n",
|
|
"14950\n",
|
|
"15000\n",
|
|
"15050\n",
|
|
"15100\n",
|
|
"15150\n",
|
|
"15200\n",
|
|
"15250\n",
|
|
"15300\n",
|
|
"15350\n",
|
|
"15400\n",
|
|
"15450\n",
|
|
"15500\n",
|
|
"15550\n",
|
|
"15600\n",
|
|
"15650\n",
|
|
"15700\n",
|
|
"15750\n",
|
|
"15800\n",
|
|
"15850\n",
|
|
"15900\n",
|
|
"15950\n",
|
|
"16000\n",
|
|
"16050\n",
|
|
"16100\n",
|
|
"16150\n",
|
|
"16200\n",
|
|
"16250\n",
|
|
"16300\n",
|
|
"16350\n",
|
|
"16400\n",
|
|
"16450\n",
|
|
"16500\n",
|
|
"16550\n",
|
|
"16600\n",
|
|
"16650\n",
|
|
"16700\n",
|
|
"16750\n",
|
|
"16800\n",
|
|
"16850\n",
|
|
"16900\n",
|
|
"16950\n",
|
|
"17000\n",
|
|
"17050\n",
|
|
"17100\n",
|
|
"17150\n",
|
|
"17200\n",
|
|
"17250\n",
|
|
"17300\n",
|
|
"17350\n",
|
|
"17400\n",
|
|
"17450\n",
|
|
"17500\n",
|
|
"17550\n",
|
|
"17600\n",
|
|
"17650\n",
|
|
"17700\n",
|
|
"17750\n",
|
|
"17800\n",
|
|
"17850\n",
|
|
"17900\n",
|
|
"17950\n",
|
|
"18000\n",
|
|
"18050\n",
|
|
"总耗时: 384.6457269191742 秒.\n",
|
|
"发现相似: 2800 , 其中已标注: 2223 .\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# from hashlib import new\n",
|
|
"import os,re,difflib,Levenshtein,time,json\n",
|
|
"\n",
|
|
"# 重要!!! 新题目的范围\n",
|
|
"id_new_problems = \"1:50000\"\n",
|
|
"threshold = 0.99\n",
|
|
"\n",
|
"# Expand an ID specification into a list of 6-digit, zero-padded ID strings.\n",
"# Chunks are separated by commas; a chunk 'a:b' is the inclusive integer range a..b,\n",
"# any other chunk is taken as a single ID. All whitespace is ignored.\n",
"def generate_number_set(string):\n",
"    string = re.sub(r\"\\s\",\"\",string)  # \\s already covers newlines\n",
"    numbers_list = []\n",
"    for s in string.split(\",\"):\n",
"        if not s:\n",
"            # Empty chunk (empty input, doubled or trailing comma): skip it instead of\n",
"            # emitting the bogus ID '000000' that ''.zfill(6) would produce.\n",
"            continue\n",
"        if \":\" in s:\n",
"            start,end = s.split(\":\")\n",
"            numbers_list.extend(str(ind).zfill(6) for ind in range(int(start),int(end)+1))\n",
"        else:\n",
"            numbers_list.append(s.zfill(6))\n",
"    return numbers_list\n",
|
|
"\n",
|
"# Strip a problem text down before similarity comparison.\n",
"def pre_treating(string):\n",
"    # Removed one after another in exactly this order; each pattern sees the\n",
"    # output of the previous removals.\n",
"    removal_patterns = (\n",
"        r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",  # whole center environments\n",
"        r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)\",  # bracket{n} / blank{n} / fourch / twoch / onech tokens\n",
"        r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",  # whitespace, backslashes, braces, $, parentheses, square brackets\n",
"        r\"[\\n\\t]\",\n",
"        r\"(displaystyle)|(overrightarrow)\",  # the words displaystyle / overrightarrow\n",
"        r\"[,\\.:;?]\",  # ASCII punctuation\n",
"    )\n",
"    for pattern in removal_patterns:\n",
"        string = re.sub(pattern,\"\",string)\n",
"    return string\n",
|
|
"\n",
|
"# difflib string comparison (SequenceMatcher ratio)\n",
"def difflab_get_equal_rate(str1, str2):\n",
"    # The strings are compared exactly as passed in; pre_treating is not applied here.\n",
"    matcher = difflib.SequenceMatcher(None, str1, str2)\n",
"    return matcher.ratio()\n",
|
|
"\n",
|
"# Jaro string comparison from the Levenshtein package\n",
"def jaro_get_equal_rate(str1,str2):\n",
"    # The strings are compared exactly as passed in; pre_treating is not applied here.\n",
"    jaro_score = Levenshtein.jaro(str1,str2)\n",
"    return jaro_score\n",
|
|
"\n",
|
"# Levenshtein.ratio string comparison\n",
"def Lev_get_equal_rate(str1,str2):\n",
"    # The strings are compared exactly as passed in; pre_treating is not applied here.\n",
"    lev_ratio = Levenshtein.ratio(str1,str2)\n",
"    return lev_ratio\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
"# Similarity function used by all comparisons below\n",
"sim_test = jaro_get_equal_rate\n",
"\n",
"# Load the problem database (a mapping from problem ID to problem record)\n",
"with open(r\"../题库0.3/Problems.json\",\"r\",encoding = \"utf8\") as f:\n",
"    database = f.read()\n",
"pro_dict = json.loads(database)\n",
"\n",
"# Split the database into old and new problems; the *_content dicts hold the\n",
"# pre-treated text that is actually compared.\n",
"new_id_list_raw = generate_number_set(id_new_problems)\n",
"# Membership is tested once per database entry, so test against a set: scanning\n",
"# the raw list each time costs up to len(pro_dict) * len(new_id_list_raw) comparisons.\n",
"new_id_set = set(new_id_list_raw)\n",
"new_id_list = [problem_id for problem_id in pro_dict if problem_id in new_id_set]\n",
"old_problems_dict = {}\n",
"new_problems_dict = {}\n",
"old_problems_dict_content = {}\n",
"new_problems_dict_content = {}\n",
"# problem_id (not id) so the builtin id() is no longer shadowed at module level\n",
"for problem_id in pro_dict:\n",
"    problem = pro_dict[problem_id]\n",
"    treated_content = pre_treating(problem[\"content\"])\n",
"    if problem_id in new_id_set:\n",
"        new_problems_dict[problem_id] = problem\n",
"        new_problems_dict_content[problem_id] = treated_content\n",
"    else:\n",
"        old_problems_dict[problem_id] = problem\n",
"        old_problems_dict_content[problem_id] = treated_content\n",
"print(\"旧题目数:\",len(old_problems_dict),\", 新题目数:\",len(new_problems_dict))\n",
|
|
"\n",
|
"# Record the start time and reset the counters and the report text\n",
"start_time = time.time()\n",
"suspect_count = 0\n",
"remarked = 0\n",
"\n",
"alike_problems = \"\"\n",
"\n",
"# Compare every new problem against every old problem\n",
"count = 0\n",
"print(\"开始新题与旧题的比对\")\n",
"for id_new in new_problems_dict:\n",
"    count += 1\n",
"    if count % 50 == 0:\n",
"        print(count)\n",
"    new_problem = new_problems_dict[id_new]\n",
"    new_content = new_problems_dict_content[id_new]\n",
"    for id_old in old_problems_dict:\n",
"        old_problem = old_problems_dict[id_old]\n",
"        similar_rate = sim_test(new_content,old_problems_dict_content[id_old])\n",
"        # True when either problem already lists the other under related or same\n",
"        already_marked = (id_new in old_problem[\"related\"] or id_new in old_problem[\"same\"]\n",
"                          or id_old in new_problem[\"related\"] or id_old in new_problem[\"same\"])\n",
"        if not (similar_rate > threshold or already_marked):\n",
"            continue\n",
"        suspect_count += 1\n",
"        if already_marked:\n",
"            remarked += 1\n",
"        else:\n",
"            # Only pairs that are not annotated yet go into the report\n",
"            alike_problems += (\"%.4f\" %similar_rate) + \"\\n\\n\" + id_new + \" \" + new_problem[\"content\"] + \"\\n\\n\" + id_old + \" \" + old_problem[\"content\"] + \"\\n\\n\"\n",
|
|
"\n",
|
"# Compare the new problems among themselves, each unordered pair once\n",
"count = 0\n",
"print(\"开始新题之间的比对\")\n",
"# Snapshot the IDs once instead of rebuilding list(new_problems_dict.keys()) on every\n",
"# round. Pair order, counters and the progressive emptying of new_problems_dict\n",
"# (one entry is left at the end) are the same as before.\n",
"pending_ids = list(new_problems_dict)\n",
"for count in range(1,len(pending_ids)):\n",
"    if count % 50 == 0:\n",
"        print(count)\n",
"    current_problem = new_problems_dict.pop(pending_ids[count-1])\n",
"    current_id = current_problem[\"id\"]\n",
"    current_problem_content = new_problems_dict_content[current_id]\n",
"    # Only the problems after the current one remain to be paired with it\n",
"    for other_position in range(count,len(pending_ids)):\n",
"        id_new = pending_ids[other_position]\n",
"        other_problem = new_problems_dict[id_new]\n",
"        similar_rate = sim_test(new_problems_dict_content[id_new],current_problem_content)\n",
"        # True when either problem already lists the other under related or same\n",
"        already_marked = (id_new in current_problem[\"related\"] or id_new in current_problem[\"same\"]\n",
"                          or current_id in other_problem[\"related\"] or current_id in other_problem[\"same\"])\n",
"        if similar_rate > threshold or already_marked:\n",
"            suspect_count += 1\n",
"            if already_marked:\n",
"                remarked += 1\n",
"            else:\n",
"                alike_problems += (\"%.4f\" %similar_rate) + \"\\n\\n\" + id_new + \" \" + other_problem[\"content\"] + \"\\n\\n\" + current_id + \" \" + current_problem[\"content\"] + \"\\n\\n\"\n",
|
|
"\n",
|
|
"\n",
|
"# Record the end time and show the results\n",
"end_time = time.time()\n",
"print(\"总耗时:\",end_time-start_time,\"秒.\")\n",
"print(\"发现相似: \",suspect_count,\", 其中已标注: \",remarked,\".\")\n",
"\n",
"# Make sure the output folder exists first, so the result of the long comparison\n",
"# run is not lost to a FileNotFoundError when the report is written.\n",
"os.makedirs(\"临时文件\",exist_ok=True)\n",
"with open(\"临时文件/相似题目.txt\",\"w\",encoding=\"utf8\") as f:\n",
"    f.write(alike_problems)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "mathdept",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.15"
|
|
},
|
|
"orig_nbformat": 4,
|
|
"vscode": {
|
|
"interpreter": {
|
|
"hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc"
|
|
}
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|