You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
408 lines
16 KiB
408 lines
16 KiB
import pandas as pd |
|
import re |
|
from difflib import SequenceMatcher |
|
from collections import Counter |
|
import chardet |
|
|
|
|
|
def detect_file_encoding(file_path):
    """Detect a file's character encoding from its first 10 KB.

    Returns chardet's best-guess encoding name, which may be ``None``
    when detection fails.
    """
    # Sampling only the head keeps detection fast on large files.
    with open(file_path, 'rb') as handle:
        sample = handle.read(10000)
    return chardet.detect(sample)['encoding']
|
|
|
|
|
def safe_read_csv(file_path):
    """Read a CSV file, trying several encodings until one works.

    Tries a chardet-detected encoding first, then a fixed fallback
    list; as a last resort re-reads the file as UTF-8 while ignoring
    undecodable bytes.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        pandas.DataFrame with the file contents.

    Raises:
        Exception: when the file cannot be read with any strategy.
    """
    # Fallback encodings, most likely first.
    encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']

    # Prefer the auto-detected encoding when detection succeeds.
    try:
        detected_encoding = detect_file_encoding(file_path)
        if detected_encoding:
            encodings.insert(0, detected_encoding)
            print(f"检测到文件编码: {detected_encoding}")
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        print("编码检测失败,使用默认编码列表")

    # Try each candidate encoding in turn.
    for encoding in encodings:
        try:
            print(f"尝试使用编码 {encoding} 读取文件...")
            df = pd.read_csv(file_path, encoding=encoding)
            print(f"成功使用编码 {encoding} 读取文件")
            return df
        except UnicodeDecodeError:
            print(f"编码 {encoding} 失败")
            continue
        except Exception as e:
            print(f"使用编码 {encoding} 时出现其他错误: {e}")
            continue

    # Last resort: decode as UTF-8 and drop undecodable bytes.
    try:
        print("尝试使用 utf-8 编码并忽略错误...")
        # BUG FIX: pandas.read_csv has no `errors=` keyword (the old
        # call raised TypeError); the supported spelling is
        # `encoding_errors=` (pandas >= 1.3).
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='ignore')
        print("成功读取文件(忽略了一些字符)")
        return df
    except Exception as e:
        raise Exception(f"无法读取文件 {file_path}: {e}")
|
|
|
|
|
def clean_text(text):
    """Normalize whitespace, strip markup, and filter characters.

    Keeps Chinese characters, ASCII letters/digits, and a whitelist of
    punctuation; collapses whitespace runs into single spaces, removes
    HTML-like tags, and applies a first pass of punctuation
    normalization (squashing doubled marks).
    """
    # Normalize line endings to spaces, collapse whitespace runs,
    # then strip anything that looks like an HTML tag.
    for pattern, replacement in ((r'\r\n|\r|\n', ' '),
                                 (r'\s+', ' '),
                                 (r'<[^>]+>', '')):
        text = re.sub(pattern, replacement, text)

    # Keep Chinese, ASCII alphanumerics, and whitelisted punctuation
    # (the enumeration mark 、 included); everything else is dropped.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、,%.。!?;:""''()【】\-\s]', '', text)

    # First-pass punctuation normalization: doubled / mixed marks
    # collapse to a single canonical mark.
    replacements = (
        (',,', ','),
        ('..', '。'),
        (',。', '。'),
        (',。', '。'),
        ('!!', '!'),
        ('??', '?'),
        (';;', ';'),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    return text.strip()
|
|
|
|
|
def remove_paragraph_duplicates(text, similarity_threshold=0.85):
    """Drop near-duplicate paragraphs, keeping the longest variant.

    Paragraphs are the '。'-separated pieces of *text*; two paragraphs
    whose SequenceMatcher ratio exceeds *similarity_threshold* count
    as duplicates, and the longer of the pair survives.

    Returns:
        (deduplicated_text, list_of_removal_descriptions)
    """
    pieces = [p.strip() for p in text.split('。')]
    pieces = [p for p in pieces if p]

    kept = []
    removed = []

    for candidate in pieces:
        matched = False
        for idx, existing in enumerate(kept):
            if SequenceMatcher(None, candidate, existing).ratio() > similarity_threshold:
                matched = True
                if len(candidate) > len(existing):
                    # Prefer the longer variant of the duplicated paragraph.
                    removed.append(f"段落替换: {existing[:50]}...")
                    kept[idx] = candidate
                else:
                    removed.append(f"段落重复: {candidate[:50]}...")
                break
        if not matched:
            kept.append(candidate)

    return '。'.join(kept), removed
|
|
|
|
|
def remove_sentence_duplicates(text, similarity_threshold=0.9):
    """Drop near-duplicate sentences and re-terminate the survivors.

    Sentences are split on 。!?;. After deduplication (the longer
    of a near-identical pair wins) each kept sentence gets a new
    terminator chosen by simple content heuristics — the original
    terminators are not preserved.

    Returns:
        (deduplicated_text, list_of_removal_descriptions)
    """
    # Splitting additionally on 、 would be too fine-grained, so the
    # enumeration mark is deliberately not a sentence boundary here.
    fragments = [s.strip() for s in re.split(r'[。!?;]', text)]
    fragments = [s for s in fragments if s]

    kept = []
    removed = []

    for candidate in fragments:
        duplicate_of = None
        for idx, existing in enumerate(kept):
            if SequenceMatcher(None, candidate, existing).ratio() > similarity_threshold:
                duplicate_of = idx
                break
        if duplicate_of is None:
            kept.append(candidate)
        elif len(candidate) > len(kept[duplicate_of]):
            # Keep the longer of the two near-identical sentences.
            removed.append(f"句子替换: {kept[duplicate_of][:30]}...")
            kept[duplicate_of] = candidate
        else:
            removed.append(f"句子重复: {candidate[:30]}...")

    # Re-attach a terminator per sentence using content heuristics:
    # polite/warning words -> 。, question cues -> ?, urgency -> !.
    rebuilt = []
    for sentence in kept:
        if any(word in sentence for word in ['请', '提醒', '注意', '防止']):
            rebuilt.append(sentence + '。')
        elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
            rebuilt.append(sentence + '?')
        elif any(word in sentence for word in ['!', '重要', '紧急', '警告']):
            rebuilt.append(sentence + '!')
        else:
            rebuilt.append(sentence + '。')

    return ''.join(rebuilt), removed
|
|
|
|
|
def remove_phrase_duplicates(text, min_phrase_length=4, max_phrase_length=20):
    """Remove repeated phrases, keeping only their first occurrence.

    Builds candidate phrases from runs of word tokens, counts how often
    each candidate appears, and for any phrase occurring at least 3
    times (and at least 6 characters long) deletes every occurrence
    after the first.

    Args:
        text: input text.
        min_phrase_length: minimum phrase length in characters.
        max_phrase_length: maximum number of consecutive tokens joined
            into one candidate phrase.

    Returns:
        (cleaned_text, list_of_removal_descriptions)
    """
    # Tokens are maximal runs of Chinese characters / ASCII alnums.
    words = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)

    # BUG FIX: the n-gram loop previously started at
    # `min_phrase_length` *tokens*, conflating token count with the
    # character-length threshold; joined multi-token phrases also drop
    # the separators present in `text`, so `phrase in cleaned_text`
    # could never be True and the function was effectively a no-op.
    # Start at single tokens and let the character-length filters do
    # their job.
    phrases = []
    for n in range(1, min(max_phrase_length + 1, len(words) + 1)):
        for i in range(len(words) - n + 1):
            phrase = ''.join(words[i:i + n])
            if len(phrase) >= min_phrase_length:
                phrases.append(phrase)

    phrase_counts = Counter(phrases)

    # Only aggressively repeated, reasonably long phrases qualify.
    frequent_phrases = [(phrase, count) for phrase, count in phrase_counts.items()
                        if count >= 3 and len(phrase) >= 6]

    cleaned_text = text
    removed_phrases = []

    # Longest phrases first, so sub-phrases of already-removed text
    # don't fire redundantly.
    for phrase, count in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
        if phrase in cleaned_text:
            first_occurrence = cleaned_text.find(phrase)
            remaining_text = cleaned_text[first_occurrence + len(phrase):]

            removed_count = remaining_text.count(phrase)
            if removed_count > 0:
                cleaned_text = (cleaned_text[:first_occurrence + len(phrase)]
                                + remaining_text.replace(phrase, ''))
                removed_phrases.append(f"短语重复({removed_count}次): {phrase}")

    return cleaned_text, removed_phrases
|
|
|
|
|
def comprehensive_deduplication(text):
    """Run paragraph-, sentence- and phrase-level deduplication in
    order, finish with a final punctuation normalization pass, and
    collect per-stage statistics.

    Returns:
        (final_text, report) where *report* records the text length
        after every stage plus descriptions of everything removed.
    """
    original_length = len(text)

    # Stage 1: coarse, paragraph-level deduplication.
    print("1. 执行段落级别去重...")
    text, paragraph_removed = remove_paragraph_duplicates(text, 0.85)
    paragraph_length = len(text)
    print(f" 段落去重后长度: {paragraph_length} (减少 {original_length - paragraph_length} 字符)")

    # Stage 2: sentence-level deduplication.
    print("2. 执行句子级别去重...")
    text, sentence_removed = remove_sentence_duplicates(text, 0.9)
    sentence_length = len(text)
    print(f" 句子去重后长度: {sentence_length} (减少 {paragraph_length - sentence_length} 字符)")

    # Stage 3: phrase-level deduplication (max 15 tokens per phrase).
    print("3. 执行短语级别去重...")
    text, phrase_removed = remove_phrase_duplicates(text, 4, 15)
    phrase_length = len(text)
    print(f" 短语去重后长度: {phrase_length} (减少 {sentence_length - phrase_length} 字符)")

    # Stage 4: squash doubled/mixed punctuation left over from the
    # deduplication passes.
    print("4. 执行最终标点符号规范化...")
    final_text = text
    for doubled, single in ((',,', ','),
                            ('..', '。'),
                            (',。', '。'),
                            (',。', '。'),
                            ('!!', '!'),
                            ('??', '?'),
                            (';;', ';')):
        final_text = final_text.replace(doubled, single)

    final_length = len(final_text)
    print(f" 最终规范化后长度: {final_length} (减少 {phrase_length - final_length} 字符)")

    # Per-stage statistics plus the removal descriptions for the report.
    report = {
        'original_length': original_length,
        'after_paragraph': paragraph_length,
        'after_sentence': sentence_length,
        'after_phrase': phrase_length,
        'final_length': final_length,
        'total_reduction': original_length - final_length,
        'reduction_ratio': (original_length - final_length) / original_length if original_length > 0 else 0,
        'removed_items': {
            'paragraphs': paragraph_removed,
            'sentences': sentence_removed,
            'phrases': phrase_removed
        }
    }

    return final_text, report
|
|
|
|
|
# 主处理流程 |
|
def main():
    """Batch-deduplicate the `merged_text` column of merged.csv.

    Reads merged.csv, runs multi-level deduplication once per unique
    id, then writes a results CSV and a detailed text report.
    """
    # Output paths defined once so the log messages always match the
    # files actually written.
    results_csv = 'batch_deduplication_results_619-1103_01.csv'
    report_txt = 'batch_deduplication_report_619-1103_01.txt'

    print("开始多级别去重处理...\n")

    # Load the source CSV (encoding handled by safe_read_csv).
    try:
        df = safe_read_csv('merged.csv')
    except Exception as e:
        print(f"读取CSV文件失败: {e}")
        return

    print(f"读取到CSV文件,共 {len(df)} 行数据")
    print(f"CSV文件列名: {list(df.columns)}")

    if 'id' in df.columns:
        print(f"可用的ID列表: {sorted(df['id'].unique())}")
    else:
        print("警告:CSV文件中没有找到'id'列")
        print("请检查CSV文件格式")
        return

    all_results = []   # one record per id, successful or not
    all_reports = []   # (id, dedup_report) for the text report

    for current_id in sorted(df['id'].unique()):
        print(f"\n{'=' * 50}")
        print(f"处理ID: {current_id}")
        print(f"{'=' * 50}")

        target_row = df[df['id'] == current_id]

        if len(target_row) == 0:
            print(f"警告:没有找到ID={current_id}的数据")
            continue

        if 'merged_text' not in target_row.columns:
            print(f"错误:找不到merged_text列")
            continue

        raw_text = target_row['merged_text'].iloc[0]

        if pd.isna(raw_text) or str(raw_text).strip() == '':
            print(f"警告:ID={current_id}的merged_text为空,跳过处理")
            all_results.append({
                'id': current_id, 'original_text': '', 'cleaned_text': '', 'final_processed_text': '',
                'original_length': 0, 'cleaned_length': 0, 'final_length': 0, 'paragraph_reduction': 0,
                'sentence_reduction': 0, 'phrase_reduction': 0, 'punctuation_reduction': 0,
                'total_reduction': 0, 'reduction_ratio': 0
            })
            continue

        # BUG FIX: coerce to str before measuring length; merged_text
        # may have a non-string dtype, and len() on e.g. a float raises.
        original_text = str(raw_text)
        print(f"原始文本长度: {len(original_text)} 字符")

        try:
            print("执行基础文本清洗...")
            cleaned_text = clean_text(original_text)
            print(f"清洗后文本长度: {len(cleaned_text)} 字符")

            final_text, dedup_report = comprehensive_deduplication(cleaned_text)

            print(f"处理完成")
            print(f"总体压缩比: {dedup_report['reduction_ratio']:.2%}")
            print(f"最终文本长度: {dedup_report['final_length']} 字符")

            all_results.append({
                'id': current_id,
                'original_text': original_text,
                'cleaned_text': cleaned_text,
                'final_processed_text': final_text,
                'original_length': len(original_text),
                'cleaned_length': len(cleaned_text),
                'final_length': len(final_text),
                'paragraph_reduction': dedup_report['original_length'] - dedup_report['after_paragraph'],
                'sentence_reduction': dedup_report['after_paragraph'] - dedup_report['after_sentence'],
                'phrase_reduction': dedup_report['after_sentence'] - dedup_report['after_phrase'],
                'punctuation_reduction': dedup_report['after_phrase'] - dedup_report['final_length'],
                'total_reduction': dedup_report['total_reduction'],
                'reduction_ratio': dedup_report['reduction_ratio']
            })
            all_reports.append((current_id, dedup_report))

        except Exception as e:
            print(f"处理ID={current_id}时出错: {str(e)}")
            all_results.append({
                'id': current_id, 'original_text': original_text, 'cleaned_text': '', 'final_processed_text': '',
                'original_length': len(original_text), 'cleaned_length': 0, 'final_length': 0,
                'paragraph_reduction': 0, 'sentence_reduction': 0, 'phrase_reduction': 0, 'punctuation_reduction': 0,
                'total_reduction': 0, 'reduction_ratio': 0
            })

    print(f"\n{'=' * 60}")
    print("所有ID处理完成!")
    print(f"{'=' * 60}")

    result_df = pd.DataFrame(all_results)

    print(f"总共处理: {len(all_results)} 个ID")
    print(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID")
    print(f"处理失败或跳过: {len([r for r in all_results if r['final_length'] == 0])} 个ID")

    if len(all_results) > 0:
        avg_reduction = result_df['reduction_ratio'].mean()
        print(f"平均压缩比: {avg_reduction:.2%}")
        print(f"总原始字符数: {result_df['original_length'].sum()}")
        print(f"总最终字符数: {result_df['final_length'].sum()}")

    try:
        result_df.to_csv(results_csv, index=False, encoding='utf-8-sig')
        print(f"结果已保存到: {results_csv}")
    except Exception as e:
        print(f"保存结果CSV时出错: {e}")

    try:
        with open(report_txt, 'w', encoding='utf-8') as f:
            f.write("=== 批量多级别去重详细报告 ===\n\n")
            f.write(f"处理日期: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"总共处理: {len(all_results)} 个ID\n")
            f.write(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID\n\n")

            if len(all_results) > 0:
                f.write("总体统计:\n")
                f.write(f"- 平均压缩比: {result_df['reduction_ratio'].mean():.2%}\n")
                f.write(f"- 总原始字符数: {result_df['original_length'].sum():,}\n")
                f.write(f"- 总最终字符数: {result_df['final_length'].sum():,}\n")
                f.write(f"- 总减少字符数: {result_df['total_reduction'].sum():,}\n\n")

            for id_num, report in all_reports:
                f.write(f"\n--- ID {id_num} 详细报告 ---\n")
                f.write(f"原始文本长度: {report['original_length']} 字符\n")
                f.write(f"最终文本长度: {report['final_length']} 字符\n")
                f.write(f"总体压缩比: {report['reduction_ratio']:.2%}\n")

                f.write("各级别处理效果:\n")
                f.write(f"1. 段落级去重: 减少 {report['original_length'] - report['after_paragraph']} 字符\n")
                f.write(f"2. 句子级去重: 减少 {report['after_paragraph'] - report['after_sentence']} 字符\n")
                f.write(f"3. 短语级去重: 减少 {report['after_sentence'] - report['after_phrase']} 字符\n")
                f.write(f"4. 最终标点规范化: 减少 {report['after_phrase'] - report['final_length']} 字符\n")

                for level, items in report['removed_items'].items():
                    if items:
                        f.write(f"{level.upper()}级别移除了 {len(items)} 项内容\n")

        # BUG FIX: this message previously printed
        # "batch_deduplication_report_619-1103.txt" while the file
        # actually written was "...619-1103_01.txt".
        print(f"详细报告已保存到: {report_txt}")
    except Exception as e:
        print(f"保存报告时出错: {e}")

    # BUG FIX: on an empty result set the DataFrame has no columns and
    # the column selection below raised KeyError; guard the preview.
    if len(all_results) > 0:
        print(f"\n结果预览:")
        print(result_df[['id', 'original_length', 'final_length', 'reduction_ratio']].head(10))
|
|
|
|
|
# Script entry point: run the batch pipeline only when executed
# directly, not when this module is imported.
if __name__ == "__main__":
    main()