因为共有5万条数据,所以首先要判断每条数据属于哪个模板。先提取每条文本的前10个字符并做groupby,条数高于一定阈值(比如1600条)的分组就视为同一个模板,予以保留。
再对剩下的数据提取文本的最后5个字符并做groupby,从而提取出剩余的模板。
import pandas as pd
import re
from tqdm import tqdm
# Load every HTML file and reduce it to plain text.
# The tag-stripping pattern is compiled once instead of once per file.
tag_pattern = re.compile(r'<[^>]+>')
index_list = []
text_list = []
for i in tqdm(range(1, 50001)):
    html_number = str(i) + '.html'
    # A context manager closes each handle as soon as it has been read,
    # instead of leaving 50,000 open files to the garbage collector.
    with open('决赛/试题/' + html_number, encoding='utf-8-sig') as html_file:
        file = html_file.read()
    # Remove tags, then the boilerplate string '地址识别试题' (presumably the
    # page heading - confirm), full-width spaces and tabs.
    text = tag_pattern.sub('', file)
    text = text.replace('地址识别试题', '').replace('\u3000', '').replace('\t', '')
    text = text.strip()
    index_list.append(html_number)
    text_list.append(text)
# One row per file: the file name and its cleaned text.
output_dict = {'index': index_list, 'text': text_list}
text_output = pd.DataFrame(output_dict)
# Fingerprint every text by its first 10 and its last 5 characters.
text_output['head'] = text_output['text'].str[:10]
text_output['tail'] = text_output['text'].str[-5:]

# Heads shared by more than 1600 texts are numbered 1..H as templates.
# The 1600 threshold has to be tuned to the actual data.
head_pivot = text_output.pivot_table(index='head', values='text', aggfunc='count')
head_list = head_pivot[head_pivot['text'] > 1600].index.to_list()
head_mode_dict = {head: number for number, head in enumerate(head_list, start=1)}
text_output['head_mode_type'] = text_output['head'].apply(
    lambda head: head_mode_dict.get(head, '')
)

# Texts without a frequent head are grouped by tail; tail templates continue
# the numbering after the head templates.
text_tail = text_output[text_output['head_mode_type'] == '']
tail_list = text_tail.pivot_table(index='tail', values='text', aggfunc='count').index.to_list()
tail_mode_dict = {
    tail: len(head_mode_dict) + offset
    for offset, tail in enumerate(tail_list, start=1)
}
text_output['tail_mode_type'] = text_tail['tail'].apply(
    lambda tail: tail_mode_dict.get(tail, '')
)
def get_mode_type(df):
    '''Return the template number of one row.

    Args:
        df: a mapping-like row that provides 'head_mode_type' and
            'tail_mode_type'.

    Returns:
        The 'head_mode_type' value unless it is the empty string, in which
        case the 'tail_mode_type' value is returned.
    '''
    head_mode = df['head_mode_type']
    return df['tail_mode_type'] if head_mode == '' else head_mode
# Resolve the final template number of every row by calling get_mode_type row by row.
text_output['mode_type'] = text_output.apply(get_mode_type, axis=1)
因为是html文本,主要存在两种样式:
1)、与普通文本类似,包含了逗号、分号、感叹号等,此类文本可以通过re.split(r'[,,。!!;;]', texta)来分割;
2)、还有一种是表格填充式的,不包含逗号等分割符,只有大量空格,所以需要个性化处理,通过空格来分割;
3)、分割的好处是:可以将两个文本的句子两两比对,如果相同则表明是纯公共部分,可以直接输出;如果有不同,则可以输出到part2,进一步提取公共因子
def get_html_list(text):
    '''Split a text on single spaces and drop the empty pieces.

    Used for HTML texts whose fields are separated only by spaces.

    Args:
        text: the string to split.

    Returns:
        The non-empty pieces in their original order.
    '''
    return [piece for piece in text.split(' ') if piece != '']
# First try to split both texts on sentence punctuation.
a_list = re.split(r'[,,。!!;;]', texta)
b_list = re.split(r'[,,。!!;;]', textb)
# Neither text contained such a delimiter: fall back to splitting on spaces.
if len(a_list) == 1 and len(b_list) == 1:
    a_list = get_html_list(texta)
    b_list = get_html_list(textb)
对于此类情况,我们只需要将2个文本进行逐字比对,直到找到不一样的内容即可。
所以针对上述三种情况,要分别根据前向匹配、后向匹配的方式提取出before_text和after_text两类文本
“我的家在上海市”中的before_text是“我的家在上海市”,after_text是空
# NOTE(review): this is an excerpt of a function body; line_a and line_b are
# its inputs. Indentation was lost in the source, so the nesting shown here is
# a reconstruction - confirm before relying on it.

# Characters that veto a candidate boundary when they directly follow a shared
# prefix or begin a shared suffix, unless they start the word '市场'.
wrong_list = ['区', '号', '道', '街', '州', '县','市','路']
# Connector strings that are accepted as a shared middle part.
and_list = ['和', '或', '、', '改', '请到']
min_length = min(len(line_a), len(line_b))
before_text, middle_text, after_text = '', '', ''
# One line is a prefix or a suffix of the other: the contained line is the
# shared part.
if line_a.startswith(line_b):
    before_text = line_b
elif line_a.endswith(line_b):
    after_text = line_b
elif line_b.startswith(line_a):
    before_text = line_a
elif line_b.endswith(line_a):
    after_text = line_a
else:
    # Forward scan: remember the longest common prefix that does not end in a
    # digit and is not directly followed by a wrong_list character.
    for i in range(1, min_length):
        if line_a[:i] == line_b[:i] and not line_a[i-1].isdigit():
            if line_a[i] in wrong_list and line_a[i:i+2] != '市场':
                continue
            else:
                before_text = line_a[:i]
    # Backward scan: j-min_length runs 0, -1, -2, ... and is used as the
    # start offset of the candidate suffix.
    # NOTE(review): for the offsets -1 and -2 the slice end (offset+2) is 1 or
    # 0, so the '市场' comparison does not look at the first two characters of
    # the suffix - confirm this is intended.
    for j in range(min_length, 0, -1):
        if line_a[j-min_length:] == line_b[j-min_length:] and not line_a[j-min_length].isdigit():
            if line_a[j-min_length] in wrong_list and line_a[j-min_length:j-min_length+2] != '市场':
                continue
            else:
                after_text = line_a[j-min_length:]
# NOTE(review): str.replace removes every occurrence of the shared text, not
# only the leading / trailing one.
new_line_a = line_a.replace(before_text, '').replace(after_text, '')
new_line_b = line_b.replace(before_text, '').replace(after_text, '')
# Extraction of middle_text starts here.
if (new_line_a == new_line_b) or not new_line_a or not new_line_b:
    pass
elif new_line_a[0] in new_line_b:
    # Grow a prefix of new_line_a while it is still a substring of new_line_b.
    # A one-character prefix only counts when it is a connector.
    # NOTE(review): `else: break` is attached to the substring test here;
    # the source indentation does not show which `if` it belongs to.
    for k in range(1, len(new_line_a)+1):
        if new_line_a[:k] in new_line_b:
            if k==1 and new_line_a[:k] in and_list:
                middle_text = new_line_a[:k]
            elif k>1:
                middle_text = new_line_a[:k]
        else:
            break
elif new_line_b[0] in new_line_a:
    # Same search with the roles of the two remainders swapped.
    for k in range(1, len(new_line_b)+1):
        if new_line_b[:k] in new_line_a:
            if k==1 and new_line_b[:k] in and_list:
                middle_text = new_line_b[:k]
            elif k>1:
                middle_text = new_line_b[:k]
        else:
            break
elif new_line_a[-1] in new_line_b:
    # k-len(new_line_a) runs 0, -1, -2, ...: the first candidate is the whole
    # of new_line_a, and the loop stops at the first candidate that fails.
    for k in range(len(new_line_a),0,-1):
        if new_line_a[k-len(new_line_a):] in new_line_b and new_line_a[k-len(new_line_a)] not in wrong_list and not new_line_a[k-len(new_line_a)].isdigit():
            middle_text = new_line_a[k-len(new_line_a):]
        else:
            break
elif new_line_b[-1] in new_line_a:
    # Same search with the roles of the two remainders swapped.
    for k in range(len(new_line_b),0,-1):
        if new_line_b[k-len(new_line_b):] in new_line_a and new_line_b[k-len(new_line_b)] not in wrong_list and not new_line_b[k-len(new_line_b)].isdigit():
            middle_text = new_line_b[k-len(new_line_b):]
        else:
            break
else:
    # Last resort: the first connector that occurs in both remainders.
    for ele in and_list:
        if ele in new_line_a and ele in new_line_b:
            middle_text = ele
            break
# An especially complex case with three variable parts: when the middle text
# ends in a connector, search the text after it for a second shared part.
# NOTE(review): placed after the whole if/elif chain here - confirm.
if len(middle_text) > 1 and middle_text[-1] in and_list:
    new_line_a2 = new_line_a[(new_line_a.find(middle_text)+len(middle_text)):]
    new_line_b2 = new_line_b[len(middle_text):]
    temp_f, temp_m, temp_e = get_two_text(new_line_a2, new_line_b2)
    if temp_f == '' and temp_e == '' and temp_m != '':
        middle_text = middle_text + '(.*?)' + temp_m
上面代码中的函数get_two_text()就是整个函数本身,此处做了递归。
import pandas as pd
import re
from tqdm import tqdm
# Step 1: load every HTML file and reduce it to plain text.
# The tag-stripping pattern is compiled once instead of once per file.
tag_pattern = re.compile(r'<[^>]+>')
index_list = []
text_list = []
for i in tqdm(range(1, 50001)):
    html_number = str(i) + '.html'
    # A context manager closes each handle as soon as it has been read,
    # instead of leaving 50,000 open files to the garbage collector.
    with open('决赛/试题/' + html_number, encoding='utf-8-sig') as html_file:
        file = html_file.read()
    # Remove tags, then the boilerplate string '地址识别试题' (presumably the
    # page heading - confirm), full-width spaces and tabs.
    text = tag_pattern.sub('', file)
    text = text.replace('地址识别试题', '').replace('\u3000', '').replace('\t', '')
    text = text.strip()
    index_list.append(html_number)
    text_list.append(text)
# One row per file: the file name and its cleaned text.
output_dict = {'index': index_list, 'text': text_list}
text_output = pd.DataFrame(output_dict)
# Step 2: find the templates and classify every text.
# Fingerprint every text by its first 10 and its last 5 characters.
text_output['head'] = text_output['text'].str[:10]
text_output['tail'] = text_output['text'].str[-5:]

# Heads shared by more than 1600 texts are numbered 1..H as templates.
# The 1600 threshold has to be tuned to the actual data.
head_pivot = text_output.pivot_table(index='head', values='text', aggfunc='count')
head_list = head_pivot[head_pivot['text'] > 1600].index.to_list()
head_mode_dict = {head: number for number, head in enumerate(head_list, start=1)}
text_output['head_mode_type'] = text_output['head'].apply(
    lambda head: head_mode_dict.get(head, '')
)

# Texts without a frequent head are grouped by tail; tail templates continue
# the numbering after the head templates.
text_tail = text_output[text_output['head_mode_type'] == '']
tail_list = text_tail.pivot_table(index='tail', values='text', aggfunc='count').index.to_list()
tail_mode_dict = {
    tail: len(head_mode_dict) + offset
    for offset, tail in enumerate(tail_list, start=1)
}
text_output['tail_mode_type'] = text_tail['tail'].apply(
    lambda tail: tail_mode_dict.get(tail, '')
)
def get_mode_type(df):
    '''Return the template number of one row.

    Args:
        df: a mapping-like row that provides 'head_mode_type' and
            'tail_mode_type'.

    Returns:
        The 'head_mode_type' value unless it is the empty string, in which
        case the 'tail_mode_type' value is returned.
    '''
    head_mode = df['head_mode_type']
    return df['tail_mode_type'] if head_mode == '' else head_mode
# Resolve the final template number of every row by calling get_mode_type row by row.
text_output['mode_type'] = text_output.apply(get_mode_type, axis=1)
#第三步:寻找公共因子
def get_two_text(line_a, line_b):
    '''Extract the text two lines share at the start, in the middle and at the end.

    Args:
        line_a: first string to compare.
        line_b: second string to compare.

    Returns:
        A ``(before_text, middle_text, after_text)`` tuple of strings; a part
        that was not found is the empty string. ``middle_text`` may contain
        the literal ``'(.*?)'`` when a second shared middle part was found.

    NOTE(review): indentation was lost in the source, so the nesting shown
    here is a reconstruction - confirm before relying on it.
    '''
    # Characters that veto a candidate boundary when they directly follow a
    # shared prefix or begin a shared suffix, unless they start the word '市场'.
    wrong_list = ['区', '号', '道', '街', '州', '县','市','路']
    # Connector strings that are accepted as a shared middle part.
    and_list = ['和', '或', '、', '改', '请到']
    min_length = min(len(line_a), len(line_b))
    before_text, middle_text, after_text = '', '', ''
    # One line is a prefix or a suffix of the other: the contained line is the
    # shared part.
    if line_a.startswith(line_b):
        before_text = line_b
    elif line_a.endswith(line_b):
        after_text = line_b
    elif line_b.startswith(line_a):
        before_text = line_a
    elif line_b.endswith(line_a):
        after_text = line_a
    else:
        # Forward scan: remember the longest common prefix that does not end
        # in a digit and is not directly followed by a wrong_list character.
        for i in range(1, min_length):
            if line_a[:i] == line_b[:i] and not line_a[i-1].isdigit():
                if line_a[i] in wrong_list and line_a[i:i+2] != '市场':
                    continue
                else:
                    before_text = line_a[:i]
        # Backward scan: j-min_length runs 0, -1, -2, ... and is used as the
        # start offset of the candidate suffix.
        # NOTE(review): for the offsets -1 and -2 the slice end (offset+2) is
        # 1 or 0, so the '市场' comparison does not look at the first two
        # characters of the suffix - confirm this is intended.
        for j in range(min_length, 0, -1):
            if line_a[j-min_length:] == line_b[j-min_length:] and not line_a[j-min_length].isdigit():
                if line_a[j-min_length] in wrong_list and line_a[j-min_length:j-min_length+2] != '市场':
                    continue
                else:
                    after_text = line_a[j-min_length:]
    # NOTE(review): str.replace removes every occurrence of the shared text,
    # not only the leading / trailing one.
    new_line_a = line_a.replace(before_text, '').replace(after_text, '')
    new_line_b = line_b.replace(before_text, '').replace(after_text, '')
    # Extraction of middle_text starts here.
    if (new_line_a == new_line_b) or not new_line_a or not new_line_b:
        pass
    elif new_line_a[0] in new_line_b:
        # Grow a prefix of new_line_a while it is still a substring of
        # new_line_b. A one-character prefix only counts when it is a connector.
        # NOTE(review): `else: break` is attached to the substring test here;
        # the source indentation does not show which `if` it belongs to.
        for k in range(1, len(new_line_a)+1):
            if new_line_a[:k] in new_line_b:
                if k==1 and new_line_a[:k] in and_list:
                    middle_text = new_line_a[:k]
                elif k>1:
                    middle_text = new_line_a[:k]
            else:
                break
    elif new_line_b[0] in new_line_a:
        # Same search with the roles of the two remainders swapped.
        for k in range(1, len(new_line_b)+1):
            if new_line_b[:k] in new_line_a:
                if k==1 and new_line_b[:k] in and_list:
                    middle_text = new_line_b[:k]
                elif k>1:
                    middle_text = new_line_b[:k]
            else:
                break
        # An especially complex case with three variable parts: when the
        # middle text ends in a connector, recurse on the text after it to
        # look for a second shared part.
        # NOTE(review): placed after the loop inside this branch - confirm.
        if len(middle_text) > 1 and middle_text[-1] in and_list:
            new_line_a2 = new_line_a[(new_line_a.find(middle_text)+len(middle_text)):]
            new_line_b2 = new_line_b[len(middle_text):]
            temp_f, temp_m, temp_e = get_two_text(new_line_a2, new_line_b2)
            if temp_f == '' and temp_e == '' and temp_m != '':
                middle_text = middle_text + '(.*?)' + temp_m
    elif new_line_a[-1] in new_line_b:
        # k-len(new_line_a) runs 0, -1, -2, ...: the first candidate is the
        # whole of new_line_a, and the loop stops at the first candidate that
        # fails the test.
        for k in range(len(new_line_a),0,-1):
            if new_line_a[k-len(new_line_a):] in new_line_b and new_line_a[k-len(new_line_a)] not in wrong_list and not new_line_a[k-len(new_line_a)].isdigit():
                middle_text = new_line_a[k-len(new_line_a):]
            else:
                break
    elif new_line_b[-1] in new_line_a:
        # Same search with the roles of the two remainders swapped.
        for k in range(len(new_line_b),0,-1):
            if new_line_b[k-len(new_line_b):] in new_line_a and new_line_b[k-len(new_line_b)] not in wrong_list and not new_line_b[k-len(new_line_b)].isdigit():
                middle_text = new_line_b[k-len(new_line_b):]
            else:
                break
    else:
        # Last resort: the first connector that occurs in both remainders.
        for ele in and_list:
            if ele in new_line_a and ele in new_line_b:
                middle_text = ele
                break
    return before_text, middle_text, after_text
def list_in_text(text):
    '''Report whether ``text`` contains any of the connector strings.

    Every connector is checked. With the indentation lost, the original
    ``else: return False`` could be read as belonging to the ``if`` inside
    the loop, which would give up after testing only the first connector.

    Args:
        text: the string to search.

    Returns:
        True if at least one connector occurs in ``text``, otherwise False.
    '''
    and_list = ['和', '或', '、', '改', '请到']
    return any(ele in text for ele in and_list)
def get_html_list(text):
    '''Split a text on single spaces and drop the empty pieces.

    Used for HTML texts whose fields are separated only by spaces.

    Args:
        text: the string to split.

    Returns:
        The non-empty pieces in their original order.
    '''
    return [piece for piece in text.split(' ') if piece != '']
def get_compile(texta, textb):
    '''Build one pattern per segment of ``texta`` by comparing it with ``textb``.

    Both texts are split on sentence punctuation; when neither contains such
    a delimiter they are split on spaces instead. Segments are compared by
    position: identical segments are kept literally, differing segments
    become the shared text around ``'(.*?)'`` placeholders.

    Args:
        texta: reference text; the result has one entry per segment of it.
        textb: text compared against ``texta``.

    Returns:
        A list of strings, one per segment of ``texta``.
    '''
    delimiter_pattern = r'[,,。!!;;]'
    a_list = re.split(delimiter_pattern, texta)
    b_list = re.split(delimiter_pattern, textb)
    if len(a_list) == 1 and len(b_list) == 1:
        a_list = get_html_list(texta)
        b_list = get_html_list(textb)
    re_list = []
    for i, segment_a in enumerate(a_list):
        if i >= len(b_list):
            # textb has fewer segments than texta. Indexing b_list here used
            # to raise IndexError; a segment without a counterpart has no
            # shared text, so it becomes a bare placeholder.
            re_list.append('(.*?)')
            continue
        segment_b = b_list[i]
        if segment_a == segment_b:
            re_list.append(segment_a)
            continue
        before_text, middle_text, after_text = get_two_text(segment_a, segment_b)
        if middle_text == '':
            re_compile = before_text + '(.*?)' + after_text
        else:
            re_compile = before_text + '(.*?)' + middle_text + '(.*?)' + after_text
        re_list.append(re_compile)
    return re_list
def get_mode_compile(mode_num):
    '''Derive the per-segment pattern list of one template.

    The first text of the template is compared with every other text of the
    same template via get_compile(), and the per-segment results are merged.

    Args:
        mode_num: template number, matched against ``text_output['mode_type']``.

    Returns:
        The merged list of segment patterns. A template with a single text
        returns that text's literal segments, and a template without texts
        returns an empty list; both cases used to fail because the result
        variable was never assigned.
    '''
    this_mode_df = text_output[text_output['mode_type'] == mode_num].reset_index(drop=True)
    # Read the column by name; the original used column position 1, which is
    # 'text' in the frame this script builds.
    texts = this_mode_df['text'].tolist()
    if not texts:
        return []
    # The reference text is the same for every comparison.
    texta = texts[0]
    if len(texts) == 1:
        # Nothing to compare against: all segments stay literal.
        return get_compile(texta, texta)
    this_mode_compile = get_compile(texta, texts[1])
    for textb in texts[2:]:
        new_mode_compile = get_compile(texta, textb)
        for j in range(len(this_mode_compile)):
            current = this_mode_compile[j]
            candidate = new_mode_compile[j]
            if '(.*?)' in current:
                if candidate.count('(.*?)') > current.count('(.*?)') and list_in_text(candidate):
                    # More placeholders and a connector: take the candidate.
                    this_mode_compile[j] = candidate
                elif '(.*?)' in candidate and (candidate.count('(.*?)') == current.count('(.*?)')) and len(candidate) < len(current):
                    # Same number of placeholders but shorter: take it.
                    this_mode_compile[j] = candidate
            elif '(.*?)' in candidate:
                # The segment was literal so far but differs in this text.
                this_mode_compile[j] = candidate
    return this_mode_compile
# Build the pattern list of every template, attach it to each row and export.
# Template numbers are assumed to run from 1 to the number of distinct values.
mode_count = text_output['mode_type'].nunique()
mode_dict = {
    mode_num: get_mode_compile(mode_num)
    for mode_num in range(1, mode_count + 1)
}
text_output['mode'] = text_output['mode_type'].apply(lambda mode_key: mode_dict[mode_key])
export_columns = ['index', 'text', 'mode', 'mode_type']
text_output[export_columns].to_csv('5万条数据(含模式匹配).csv', encoding='gbk', index=False)