您的当前位置:首页正文

Python:批量提取文本中的共性部分并转化为正则表达式

2024-11-07 来源:个人技术集锦

前言

转换思路

1、模板分类

因为是5万条数据,所以首先要将这些数据判断出属于哪个模板。首先提取文本中前10个字符并groupby,高于一定阈值(比如1600条),就表明是同一个模板,需要保留。
再将剩下的数据,提取文本最后5个字符并groupby,提取出剩下的模板。

代码:
import pandas as pd
import re
from tqdm import tqdm

# Load every HTML file and reduce it to plain text
index_list = []
text_list = []
# Compiled once; the same tag-stripping pattern is reused for every file
tag_pattern = re.compile(r'<[^>]+>')

for i in tqdm(range(1, 50001)):
    html_number = str(i) + '.html'
    # The context manager closes each handle right away instead of leaving
    # tens of thousands of open files to the garbage collector
    with open('决赛/试题/' + html_number, encoding='utf-8-sig') as html_file:
        raw_html = html_file.read()
    text = tag_pattern.sub('', raw_html)
    # Drop the page title, ideographic spaces and tabs, then trim the ends
    text = text.replace('地址识别试题', '').replace('\u3000', '').replace('\t', '')
    text = text.strip()
    index_list.append(html_number)
    text_list.append(text)

# One row per file: the file name and its cleaned text
output_dict = {'index':index_list, 'text':text_list}
text_output = pd.DataFrame(output_dict)

# Fingerprint each text by its first 10 and last 5 characters
text_output['head'] = text_output['text'].str[:10]
text_output['tail'] = text_output['text'].str[-5:]

# Heads shared by more than 1600 texts are treated as templates
# (the 1600 threshold has to be tuned to the actual data)
head_pivot = text_output.pivot_table(index='head', values='text', aggfunc='count')
head_list = head_pivot.loc[head_pivot['text'] > 1600].index.to_list()

# Template ids for head-based templates start at 1
head_mode_dict = {head: number for number, head in enumerate(head_list, start=1)}

text_output['head_mode_type'] = text_output['head'].apply(
    lambda head: head_mode_dict.get(head, '')
)

# Rows without a head template are grouped by their tail instead
text_tail = text_output[text_output['head_mode_type'] == '']
tail_list = text_tail.pivot_table(index='tail', values='text', aggfunc='count').index.to_list()

# Tail-based ids continue right after the last head-based id
tail_mode_dict = {
    tail: number
    for number, tail in enumerate(tail_list, start=len(head_mode_dict) + 1)
}

text_output['tail_mode_type'] = text_tail['tail'].apply(
    lambda tail: tail_mode_dict.get(tail, '')
)

def get_mode_type(df):
    """Return the template id of one row.

    The head-based id wins whenever it is set; the tail-based id is used
    only when 'head_mode_type' is the empty string.
    """
    head_id = df['head_mode_type']
    return df['tail_mode_type'] if head_id == '' else head_id

# Merge the head-based and tail-based ids into a single template id per row
text_output['mode_type'] = text_output.apply(get_mode_type, axis=1)
2、寻找公共因子
part 1:文本分割

因为是html文本,主要存在两种样式:
1)、与普通文本类似,包含了逗号、分号、感叹号等,此类文本可以通过re.split(r'[,,。!!;;]', texta)来分割;
2)、还有一种是表格填充式的,不包含逗号等分割符,只有大量空格,所以需要个性化处理,通过空格来分割;
3)、分割的好处是:可以将两个文本的句子两两比对,如果相同则表明是纯公共部分,可以直接输出;如果有不同,则可以输出到part2,进一步提取公共因子

def get_html_list(text):
    """Split a space-separated (table-like) text into its non-empty pieces."""
    # Runs of spaces produce empty strings in split(' '); filter(None, ...) drops them
    return list(filter(None, text.split(' ')))

# Split on sentence punctuation in both ASCII and full-width form.
# The full-width comma, exclamation mark and semicolon are written as
# \uXXXX escapes so they survive text normalization of this listing.
a_list = re.split(r'[,\uff0c。!\uff01;\uff1b]', texta)
b_list = re.split(r'[,\uff0c。!\uff01;\uff1b]', textb)
# Neither text contains punctuation: treat it as a table-like page and split on spaces
if len(a_list) == 1 and len(b_list) == 1:
    a_list = get_html_list(texta)
    b_list = get_html_list(textb)
part 2:句子间的公共部分提取
1)、单个地址

对于此类情况,我们只需要将2个文本进行逐字比对,直到找到不一样的内容即可。

所以针对上述三种情况,要分别根据前向匹配、后向匹配的方式提取出before_text和after_text两类文本
“我的家在上海市”中的before_text是“我的家在上海市”,after_text是空

# Characters that a shared span must not border on / start with (see the loops below)
wrong_list = ['区', '号', '道', '街', '州', '县','市','路']
# Connector tokens that may join several addresses in one sentence
and_list = ['和', '或', '、', '改', '请到']
min_length = min(len(line_a), len(line_b))
before_text, middle_text, after_text = '', '', ''
# One line is entirely the prefix or suffix of the other: that line is the shared part
if line_a.startswith(line_b):
    before_text = line_b
elif line_a.endswith(line_b):
    after_text = line_b
elif line_b.startswith(line_a):
    before_text = line_a
elif line_b.endswith(line_a):
    after_text = line_a
else:
    # Forward match: keep the longest common prefix that does not end in a digit
    # and is not directly followed by a wrong_list character ('市场' is allowed)
    for i in range(1, min_length):
        if line_a[:i] == line_b[:i] and not line_a[i-1].isdigit():
            if line_a[i] in wrong_list and line_a[i:i+2] != '市场':
                continue
            else:
                before_text = line_a[:i]
    # Backward match: keep the longest common suffix that does not start with a
    # digit or with a wrong_list character ('市场' is allowed)
    for j in range(min_length, 0, -1):
        if line_a[j-min_length:] == line_b[j-min_length:] and not line_a[j-min_length].isdigit():
            if line_a[j-min_length] in wrong_list and line_a[j-min_length:j-min_length+2] != '市场':
                continue
            else:
                after_text = line_a[j-min_length:]
    # Cut off only the leading prefix and trailing suffix; str.replace would also
    # delete repeats of them inside the variable part
    new_line_a = line_a[len(before_text):len(line_a) - len(after_text)]
    new_line_b = line_b[len(before_text):len(line_b) - len(after_text)]
2)、两个地址
# Extract middle_text: a piece shared by the two leftovers (new_line_a / new_line_b)
# that remain once the common prefix and suffix have been removed
if (new_line_a == new_line_b) or not new_line_a or not new_line_b:
    # Nothing to compare: the leftovers are identical or one of them is empty
    pass
elif new_line_a[0] in new_line_b:
    # Grow a prefix of new_line_a for as long as it still occurs inside new_line_b
    for k in range(1, len(new_line_a)+1):
        if new_line_a[:k] in new_line_b:
            # A single shared character only counts when it is a connector from and_list
            if k==1 and new_line_a[:k] in and_list:
                middle_text = new_line_a[:k]
            elif k>1:
                middle_text = new_line_a[:k]
        else:
            break
elif new_line_b[0] in new_line_a:
    # Same search with the roles of the two leftovers swapped
    for k in range(1, len(new_line_b)+1):
        if new_line_b[:k] in new_line_a:
            if k==1 and new_line_b[:k] in and_list:
                middle_text = new_line_b[:k]
            elif k>1:
                middle_text = new_line_b[:k]
        else:
            break

elif new_line_a[-1] in new_line_b:
    # Suffix search on new_line_a; a candidate may not start with a wrong_list
    # character or a digit.
    # NOTE(review): on the first iteration the index k-len(new_line_a) is 0, so the
    # whole of new_line_a is tested and the loop breaks at once unless it occurs
    # in new_line_b - confirm this is intended
    for k in range(len(new_line_a),0,-1):
        if new_line_a[k-len(new_line_a):] in new_line_b and new_line_a[k-len(new_line_a)] not in wrong_list and not new_line_a[k-len(new_line_a)].isdigit():
            middle_text = new_line_a[k-len(new_line_a):]
        else:
            break
elif new_line_b[-1] in new_line_a:
    # Same suffix search with the roles swapped (same first-iteration caveat)
    for k in range(len(new_line_b),0,-1):
        if new_line_b[k-len(new_line_b):] in new_line_a and new_line_b[k-len(new_line_b)] not in wrong_list and not new_line_b[k-len(new_line_b)].isdigit():
            middle_text = new_line_b[k-len(new_line_b):]
        else:
            break
else:
    # Fallback: use the first and_list connector that both leftovers contain
    for ele in and_list:
        if ele in new_line_a and ele in new_line_b:
            middle_text = ele
            break
3)、三个地址
# A particularly complex case with three variable parts:
# the shared middle piece itself ends with a connector from and_list
if len(middle_text) > 1 and middle_text[-1] in and_list:
    # Text that follows middle_text in each leftover
    new_line_a2 = new_line_a[(new_line_a.find(middle_text)+len(middle_text)):]
    new_line_b2 = new_line_b[len(middle_text):]
    # Recurse on the remainders to look for a second shared piece
    temp_f, temp_m, temp_e = get_two_text(new_line_a2, new_line_b2)
    # Only a pure middle match (no shared prefix/suffix) extends the pattern
    if temp_f == '' and temp_e == '' and temp_m != '':
        middle_text = middle_text + '(.*?)' + temp_m

上面代码中的函数get_two_text()就是整个函数本身,此处做了递归。

part 3:模式之间的判断

全部代码

import pandas as pd
import re
from tqdm import tqdm

# Step 1: load every HTML file and reduce it to plain text
index_list = []
text_list = []
# Compiled once; the same tag-stripping pattern is reused for every file
tag_pattern = re.compile(r'<[^>]+>')

for i in tqdm(range(1, 50001)):
    html_number = str(i) + '.html'
    # The context manager closes each handle right away instead of leaving
    # tens of thousands of open files to the garbage collector
    with open('决赛/试题/' + html_number, encoding='utf-8-sig') as html_file:
        raw_html = html_file.read()
    text = tag_pattern.sub('', raw_html)
    # Drop the page title, ideographic spaces and tabs, then trim the ends
    text = text.replace('地址识别试题', '').replace('\u3000', '').replace('\t', '')
    text = text.strip()
    index_list.append(html_number)
    text_list.append(text)

# One row per file: the file name and its cleaned text
output_dict = {'index':index_list, 'text':text_list}
text_output = pd.DataFrame(output_dict)

# Step 2: find the (about 30) templates and classify every text
# Fingerprint each text by its first 10 and last 5 characters
text_output['head'] = text_output['text'].str[:10]
text_output['tail'] = text_output['text'].str[-5:]

# Heads shared by more than 1600 texts are treated as templates
# (the 1600 threshold has to be tuned to the actual data)
head_pivot = text_output.pivot_table(index='head', values='text', aggfunc='count')
head_list = head_pivot.loc[head_pivot['text'] > 1600].index.to_list()

# Template ids for head-based templates start at 1
head_mode_dict = {head: number for number, head in enumerate(head_list, start=1)}

text_output['head_mode_type'] = text_output['head'].apply(
    lambda head: head_mode_dict.get(head, '')
)

# Rows without a head template are grouped by their tail instead
text_tail = text_output[text_output['head_mode_type'] == '']
tail_list = text_tail.pivot_table(index='tail', values='text', aggfunc='count').index.to_list()

# Tail-based ids continue right after the last head-based id
tail_mode_dict = {
    tail: number
    for number, tail in enumerate(tail_list, start=len(head_mode_dict) + 1)
}

text_output['tail_mode_type'] = text_tail['tail'].apply(
    lambda tail: tail_mode_dict.get(tail, '')
)

def get_mode_type(df):
    """Return the template id of one row.

    The head-based id wins whenever it is set; the tail-based id is used
    only when 'head_mode_type' is the empty string.
    """
    head_id = df['head_mode_type']
    if head_id == '':
        return df['tail_mode_type']
    return head_id

# Merge the head-based and tail-based ids into a single template id per row
text_output['mode_type'] = text_output.apply(get_mode_type, axis=1)

#第三步:寻找公共因子
def get_two_text(line_a, line_b):
    """Find the text two sentence fragments share around their differing parts.

    Args:
        line_a: first fragment.
        line_b: second fragment, expected to come from the same template.

    Returns:
        A tuple (before_text, middle_text, after_text):
        before_text is the accepted common prefix, after_text the accepted
        common suffix, and middle_text a shared piece found inside the
        remaining variable parts ('' when none is found). middle_text may
        itself contain '(.*?)' when a second shared piece follows a connector.
    """
    # Characters a shared span must not border on / start with (see the loops below)
    wrong_list = ['区', '号', '道', '街', '州', '县','市','路']
    # Connector tokens that may join several addresses in one sentence
    and_list = ['和', '或', '、', '改', '请到']
    min_length = min(len(line_a), len(line_b))
    before_text, middle_text, after_text = '', '', ''
    # One line is entirely the prefix or suffix of the other: that line is the shared part
    if line_a.startswith(line_b):
        before_text = line_b
    elif line_a.endswith(line_b):
        after_text = line_b
    elif line_b.startswith(line_a):
        before_text = line_a
    elif line_b.endswith(line_a):
        after_text = line_a
    else:
        # Forward match: keep the longest common prefix that does not end in a digit
        # and is not directly followed by a wrong_list character ('市场' is allowed)
        for i in range(1, min_length):
            if line_a[:i] == line_b[:i] and not line_a[i-1].isdigit():
                if line_a[i] in wrong_list and line_a[i:i+2] != '市场':
                    continue
                else:
                    before_text = line_a[:i]
        # Backward match: keep the longest common suffix that does not start with a
        # digit or with a wrong_list character ('市场' is allowed)
        for j in range(min_length, 0, -1):
            if line_a[j-min_length:] == line_b[j-min_length:] and not line_a[j-min_length].isdigit():
                if line_a[j-min_length] in wrong_list and line_a[j-min_length:j-min_length+2] != '市场':
                    continue
                else:
                    after_text = line_a[j-min_length:]
        # Cut off only the leading prefix and the trailing suffix. str.replace would
        # also delete repeats of them inside the variable part (e.g. a connector
        # that happens to equal the prefix), hiding a shared middle piece.
        new_line_a = line_a[len(before_text):len(line_a) - len(after_text)]
        new_line_b = line_b[len(before_text):len(line_b) - len(after_text)]

        # Extract middle_text from the two leftovers
        if (new_line_a == new_line_b) or not new_line_a or not new_line_b:
            # Nothing to compare: the leftovers are identical or one of them is empty
            pass
        elif new_line_a[0] in new_line_b:
            # Grow a prefix of new_line_a for as long as it still occurs inside new_line_b
            for k in range(1, len(new_line_a)+1):
                if new_line_a[:k] in new_line_b:
                    # A single shared character only counts when it is a connector
                    if k==1 and new_line_a[:k] in and_list:
                        middle_text = new_line_a[:k]
                    elif k>1:
                        middle_text = new_line_a[:k]
                else:
                    break
        elif new_line_b[0] in new_line_a:
            # Same search with the roles of the two leftovers swapped
            for k in range(1, len(new_line_b)+1):
                if new_line_b[:k] in new_line_a:
                    if k==1 and new_line_b[:k] in and_list:
                        middle_text = new_line_b[:k]
                    elif k>1:
                        middle_text = new_line_b[:k]
                else:
                    break
            # Three variable parts: the shared piece ends with a connector, so
            # recurse on what follows it to look for a second shared piece
            if len(middle_text) > 1 and middle_text[-1] in and_list:
                new_line_a2 = new_line_a[(new_line_a.find(middle_text)+len(middle_text)):]
                new_line_b2 = new_line_b[len(middle_text):]
                temp_f, temp_m, temp_e = get_two_text(new_line_a2, new_line_b2)
                # Only a pure middle match (no shared prefix/suffix) extends the pattern
                if temp_f == '' and temp_e == '' and temp_m != '':
                    middle_text = middle_text + '(.*?)' + temp_m

        elif new_line_a[-1] in new_line_b:
            # Suffix search on new_line_a; a candidate may not start with a
            # wrong_list character or a digit.
            # NOTE(review): on the first iteration the index is 0, so the whole of
            # new_line_a is tested and the loop breaks at once unless it occurs in
            # new_line_b - confirm this is intended
            for k in range(len(new_line_a),0,-1):
                if new_line_a[k-len(new_line_a):] in new_line_b and new_line_a[k-len(new_line_a)] not in wrong_list and not new_line_a[k-len(new_line_a)].isdigit():
                    middle_text = new_line_a[k-len(new_line_a):]
                else:
                    break
        elif new_line_b[-1] in new_line_a:
            # Same suffix search with the roles swapped (same first-iteration caveat)
            for k in range(len(new_line_b),0,-1):
                if new_line_b[k-len(new_line_b):] in new_line_a and new_line_b[k-len(new_line_b)] not in wrong_list and not new_line_b[k-len(new_line_b)].isdigit():
                    middle_text = new_line_b[k-len(new_line_b):]
                else:
                    break
        else:
            # Fallback: use the first connector that both leftovers contain
            for ele in and_list:
                if ele in new_line_a and ele in new_line_b:
                    middle_text = ele
                    break
    return before_text, middle_text, after_text

def list_in_text(text):
    """Tell whether text contains at least one of the connector tokens."""
    connectors = ('和', '或', '、', '改', '请到')
    return any(token in text for token in connectors)
    
def get_html_list(text):
    """Split a space-separated (table-like) text into its non-empty pieces."""
    # split(' ') yields '' for every extra space in a run; keep the real pieces only
    return [piece for piece in text.split(' ') if piece]

def get_compile(texta, textb):
    """Build the list of per-segment patterns shared by two texts of one template.

    Both texts are split into segments on sentence punctuation (or on spaces
    when neither contains any). Segments are compared position by position:
    identical segments are kept verbatim, differing ones are replaced by
    their shared parts joined with '(.*?)' placeholders.

    Args:
        texta: reference text.
        textb: text compared against texta.

    Returns:
        A list with one string per segment of texta.

    Raises:
        IndexError: if textb yields fewer segments than texta.
    """
    # ASCII and full-width comma / exclamation mark / semicolon plus the
    # ideographic full stop. The full-width forms are written as \uXXXX escapes
    # so they cannot be folded into their ASCII twins by text normalization.
    separators = r'[,\uff0c。!\uff01;\uff1b]'
    a_list = re.split(separators, texta)
    b_list = re.split(separators, textb)
    # Neither text contains punctuation: treat it as a table-like page and split on spaces
    if len(a_list) == 1 and len(b_list) == 1:
        a_list = get_html_list(texta)
        b_list = get_html_list(textb)
    re_list = []
    for i, segment_a in enumerate(a_list):
        segment_b = b_list[i]
        if segment_a == segment_b:
            # Pure template text: keep it as is
            re_list.append(segment_a)
            continue
        before_text, middle_text, after_text = get_two_text(segment_a, segment_b)
        if middle_text == '':
            re_compile = before_text + '(.*?)' + after_text
        else:
            re_compile = before_text + '(.*?)' + middle_text + '(.*?)' + after_text
        re_list.append(re_compile)
    return re_list
    
def get_mode_compile(mode_num):
    """Derive the segment patterns of one template from all of its texts.

    The first text of the template is compared with every other text of the
    same template; for each segment the most suitable pattern seen so far
    is kept.

    Args:
        mode_num: template id, matched against text_output['mode_type'].

    Returns:
        A list of segment patterns. It is empty when no row has this id, and
        it holds the plain segments of the only text when the template has a
        single row (previously both cases raised UnboundLocalError).
    """
    # Select the texts by column name rather than by column position
    texts = text_output.loc[text_output['mode_type'] == mode_num, 'text'].tolist()
    if not texts:
        return []
    texta = texts[0]
    if len(texts) == 1:
        # Nothing to compare against: every segment is literal text
        return get_compile(texta, texta)

    this_mode_compile = get_compile(texta, texts[1])
    for textb in texts[2:]:
        new_mode_compile = get_compile(texta, textb)
        for j, current in enumerate(this_mode_compile):
            candidate = new_mode_compile[j]
            if '(.*?)' in current:
                if candidate.count('(.*?)') > current.count('(.*?)') and list_in_text(candidate):
                    # More placeholders and a connector token: several addresses in one segment
                    this_mode_compile[j] = candidate
                elif '(.*?)' in candidate and (candidate.count('(.*?)') == current.count('(.*?)')) and len(candidate) < len(current):
                    # Same number of placeholders: prefer the shorter pattern
                    this_mode_compile[j] = candidate
            elif '(.*?)' in candidate:
                # The segment looked constant so far but varies in this text
                this_mode_compile[j] = candidate
    return this_mode_compile

# Build the pattern list of every template id, then export the annotated rows
mode_count = text_output['mode_type'].nunique()
mode_dict = {}
for mode_num in range(1, mode_count + 1):
    mode_dict[mode_num] = get_mode_compile(mode_num)

# Attach each row's template patterns and write the result as a GBK-encoded CSV
text_output['mode'] = text_output['mode_type'].apply(lambda mode_id: mode_dict[mode_id])
export_frame = text_output[['index', 'text', 'mode', 'mode_type']]
export_frame.to_csv('5万条数据(含模式匹配).csv', encoding='gbk', index=False)

Top