# NOTE(review): the four lines that used to be here were web-scrape residue
# from the hosting UI ("You can not select more than 25 topics…",
# "408 lines", "16 KiB") — not part of the program. Converted to this
# comment so the module parses.

import pandas as pd
import re
from difflib import SequenceMatcher
from collections import Counter
import chardet
def detect_file_encoding(file_path):
    """Guess a file's character encoding from its leading bytes.

    Args:
        file_path (str): path of the file to probe.

    Returns:
        str | None: encoding name reported by chardet (chardet may return
        None when it cannot decide).
    """
    # Sampling only the first 10 KB is enough for chardet's detector and
    # avoids pulling a large file fully into memory.
    with open(file_path, 'rb') as fh:
        sample = fh.read(10000)
    return chardet.detect(sample)['encoding']
def safe_read_csv(file_path):
    """Read a CSV file, trying several encodings until one succeeds.

    Strategy: try chardet's guess first (when available), then a fixed list
    of encodings common for Chinese data, and finally fall back to UTF-8
    while ignoring undecodable bytes.

    Args:
        file_path (str): path of the CSV file.

    Returns:
        pandas.DataFrame: the parsed CSV content.

    Raises:
        Exception: if the file cannot be read with any strategy.
    """
    encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']
    # Put the auto-detected encoding (if any) at the front of the queue.
    try:
        detected_encoding = detect_file_encoding(file_path)
        if detected_encoding:
            encodings.insert(0, detected_encoding)
            print(f"检测到文件编码: {detected_encoding}")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; detection failure just falls back to the list.
        print("编码检测失败,使用默认编码列表")
    for encoding in encodings:
        try:
            print(f"尝试使用编码 {encoding} 读取文件...")
            df = pd.read_csv(file_path, encoding=encoding)
            print(f"成功使用编码 {encoding} 读取文件")
            return df
        except UnicodeDecodeError:
            print(f"编码 {encoding} 失败")
            continue
        except Exception as e:
            print(f"使用编码 {encoding} 时出现其他错误: {e}")
            continue
    # Last resort: decode as UTF-8 and drop undecodable bytes.
    try:
        print("尝试使用 utf-8 编码并忽略错误...")
        # BUG FIX: pandas.read_csv has no `errors` keyword; the byte-error
        # policy is passed via `encoding_errors` (pandas >= 1.3). The old
        # call raised TypeError, so this fallback could never work.
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='ignore')
        print("成功读取文件(忽略了一些字符)")
        return df
    except Exception as e:
        raise Exception(f"无法读取文件 {file_path}: {e}")
def clean_text(text):
    """Basic cleaning pass: normalise whitespace, strip HTML tags and
    disallowed characters, and collapse doubled punctuation.

    Args:
        text (str): raw input text.

    Returns:
        str: cleaned text with surrounding whitespace removed.
    """
    # Normalise every line ending to a space, then collapse whitespace runs.
    text = re.sub(r'\r\n|\r|\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Strip HTML tags, if any survived upstream processing.
    text = re.sub(r'<[^>]+>', '', text)
    # Keep CJK ideographs, ASCII alphanumerics and common Chinese/ASCII
    # punctuation (including the enumeration comma 、); drop everything else.
    # NOTE(review): the quote characters here were garbled in the original
    # (full-width quotes collapsed to ASCII, accidentally splitting the
    # pattern into two adjacent string literals); restored as the curly CJK
    # quotes “”‘’ — confirm against the upstream source.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、,%.。!?;:“”‘’()【】\-\s]', '', text)
    # First-pass punctuation normalisation: collapse each doubled mark to one.
    # NOTE(review): the replacement values were empty strings in the garbled
    # original, which *deleted* doubled punctuation entirely (and the map
    # contained a duplicated ',。' key); collapsing to a single mark is the
    # presumed intent — confirm.
    punctuation_map = {
        ',,': ',',
        '..': '。',
        ',。': '。',
        '!!': '!',
        '??': '?',
        ';;': ';',
    }
    for old, new in punctuation_map.items():
        text = text.replace(old, new)
    return text.strip()
def remove_paragraph_duplicates(text, similarity_threshold=0.85):
    """Paragraph-level dedup: drop paragraphs that are near-duplicates.

    For each pair of similar paragraphs the longer one is kept (on the
    assumption it carries more information).

    Args:
        text (str): input text.
        similarity_threshold (float): SequenceMatcher ratio above which two
            paragraphs are considered duplicates.

    Returns:
        tuple: (deduplicated text, list of human-readable removal notes).
    """
    # BUG FIX: the original separator literal was lost to mojibake —
    # text.split('') raises "ValueError: empty separator". '\n' is the
    # presumed paragraph delimiter — confirm against the upstream source.
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    unique_paragraphs = []
    removed_paragraphs = []
    for paragraph in paragraphs:
        is_similar = False
        for idx, existing in enumerate(unique_paragraphs):
            similarity = SequenceMatcher(None, paragraph, existing).ratio()
            if similarity > similarity_threshold:
                is_similar = True
                # Keep whichever of the two near-duplicates is longer.
                if len(paragraph) > len(existing):
                    removed_paragraphs.append(f"段落替换: {existing[:50]}...")
                    unique_paragraphs[idx] = paragraph
                else:
                    removed_paragraphs.append(f"段落重复: {paragraph[:50]}...")
                break
        if not is_similar:
            unique_paragraphs.append(paragraph)
    return '\n'.join(unique_paragraphs), removed_paragraphs
def remove_sentence_duplicates(text, similarity_threshold=0.9):
    """Sentence-level dedup: drop near-duplicate sentences, then re-attach
    terminal punctuation.

    Args:
        text (str): input text.
        similarity_threshold (float): SequenceMatcher ratio above which two
            sentences count as duplicates.

    Returns:
        tuple: (deduplicated text, list of human-readable removal notes).
    """
    # Split on full-width terminal punctuation; the delimiters are dropped
    # here and re-added heuristically below.
    sentences = re.split(r'[。!?;]', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    unique_sentences = []
    removed_sentences = []
    for sentence in sentences:
        is_duplicate = False
        for idx, existing in enumerate(unique_sentences):
            similarity = SequenceMatcher(None, sentence, existing).ratio()
            if similarity > similarity_threshold:
                is_duplicate = True
                # Keep whichever of the two near-duplicates is longer.
                if len(sentence) > len(existing):
                    removed_sentences.append(f"句子替换: {existing[:30]}...")
                    unique_sentences[idx] = sentence
                else:
                    removed_sentences.append(f"句子重复: {sentence[:30]}...")
                break
        if not is_duplicate:
            unique_sentences.append(sentence)
    # Re-attach terminal punctuation by simple keyword heuristics.
    # BUG FIX / NOTE(review): the garbled original had '' entries in the
    # keyword lists ('' in sentence is vacuously True, so EVERY sentence took
    # the first branch) and appended empty suffixes. The marks below
    # (!/?/。) and the endswith targets 吗/呢 are a reconstruction of the
    # presumed intent — confirm against the upstream source.
    result = []
    for sentence in unique_sentences:
        if not sentence:
            continue
        if any(word in sentence for word in ['提醒', '注意', '防止']):
            result.append(sentence + '!')
        elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
            result.append(sentence + '?')
        elif any(word in sentence for word in ['重要', '紧急', '警告']):
            result.append(sentence + '!')
        else:
            result.append(sentence + '。')
    return ''.join(result), removed_sentences
def remove_phrase_duplicates(text, min_phrase_length=4, max_phrase_length=20):
    """Phrase-level dedup: keep the first occurrence of a frequently
    repeated phrase and delete the later ones.

    Args:
        text (str): input text.
        min_phrase_length (int): minimum phrase length in characters.
        max_phrase_length (int): maximum phrase size in word runs.

    Returns:
        tuple: (cleaned text, list of human-readable removal notes).
    """
    # Tokenise into maximal runs of CJK/alphanumeric characters.
    words = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)
    # Build candidate phrases from 1..max_phrase_length consecutive runs.
    # BUG FIX: the original started at 4 runs, but a phrase joined from two
    # or more *maximal* runs can never literally reappear in the punctuated
    # text (the separators are gone from the join), so `phrase in text`
    # below almost never matched and the function removed nothing. Starting
    # at a single run makes repeated Chinese phrases (one run after
    # punctuation splitting) detectable; the character-length filters are
    # unchanged.
    phrases = []
    for n in range(1, min(max_phrase_length, len(words)) + 1):
        for i in range(len(words) - n + 1):
            phrase = ''.join(words[i:i + n])
            if len(phrase) >= min_phrase_length:
                phrases.append(phrase)
    phrase_counts = Counter(phrases)
    # Only phrases seen at least 3 times and at least 6 characters long are
    # treated as boilerplate worth removing.
    frequent_phrases = [(phrase, count) for phrase, count in phrase_counts.items()
                        if count >= 3 and len(phrase) >= 6]
    cleaned_text = text
    removed_phrases = []
    # Longest phrases first, so removing a long repeat does not leave
    # fragments that its shorter sub-phrases would then re-match.
    for phrase, count in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
        if phrase in cleaned_text:
            first_occurrence = cleaned_text.find(phrase)
            remaining_text = cleaned_text[first_occurrence + len(phrase):]
            removed_count = remaining_text.count(phrase)
            if removed_count > 0:
                cleaned_text = (cleaned_text[:first_occurrence + len(phrase)]
                                + remaining_text.replace(phrase, ''))
                removed_phrases.append(f"短语重复({removed_count}次): {phrase}")
    return cleaned_text, removed_phrases
def comprehensive_deduplication(text):
    """Run all dedup levels in order, then normalise punctuation.

    Pipeline: paragraph -> sentence -> phrase dedup, then a final pass that
    collapses doubled punctuation; each stage works on the previous stage's
    output and its reduction is printed.

    Args:
        text (str): cleaned input text.

    Returns:
        tuple: (final text, report dict with per-stage lengths, total
        reduction, reduction ratio and the removed items per level).
    """
    original_length = len(text)
    # 1. paragraph-level dedup
    print("1. 执行段落级别去重...")
    text, paragraph_removed = remove_paragraph_duplicates(text, 0.85)
    paragraph_length = len(text)
    print(f" 段落去重后长度: {paragraph_length} (减少 {original_length - paragraph_length} 字符)")
    # 2. sentence-level dedup
    print("2. 执行句子级别去重...")
    text, sentence_removed = remove_sentence_duplicates(text, 0.9)
    sentence_length = len(text)
    print(f" 句子去重后长度: {sentence_length} (减少 {paragraph_length - sentence_length} 字符)")
    # 3. phrase-level dedup
    print("3. 执行短语级别去重...")
    text, phrase_removed = remove_phrase_duplicates(text, 4, 15)
    phrase_length = len(text)
    print(f" 短语去重后长度: {phrase_length} (减少 {sentence_length - phrase_length} 字符)")
    # 4. final punctuation normalisation
    print("4. 执行最终标点符号规范化...")
    # NOTE(review): as in clean_text, the replacement values were lost in
    # the garbled original ('' silently deleted doubled punctuation, and the
    # ',。' key was duplicated); collapsing each doubled mark to a single
    # one is the presumed intent — confirm.
    punctuation_map = {
        ',,': ',',
        '..': '。',
        ',。': '。',
        '!!': '!',
        '??': '?',
        ';;': ';',
    }
    final_text = text
    for old, new in punctuation_map.items():
        final_text = final_text.replace(old, new)
    final_length = len(final_text)
    print(f" 最终规范化后长度: {final_length} (减少 {phrase_length - final_length} 字符)")
    # Detailed per-stage report for the caller / report file.
    report = {
        'original_length': original_length,
        'after_paragraph': paragraph_length,
        'after_sentence': sentence_length,
        'after_phrase': phrase_length,
        'final_length': final_length,
        'total_reduction': original_length - final_length,
        'reduction_ratio': (original_length - final_length) / original_length if original_length > 0 else 0,
        'removed_items': {
            'paragraphs': paragraph_removed,
            'sentences': sentence_removed,
            'phrases': phrase_removed
        }
    }
    return final_text, report
# Main processing pipeline
def main():
    """Batch entry point: dedupe the `merged_text` column of merged.csv per
    id and write a results CSV plus a detailed per-id text report."""
    print("开始多级别去重处理...\n")
    # Load the input CSV with encoding fallback.
    try:
        df = safe_read_csv('merged.csv')
    except Exception as e:
        print(f"读取CSV文件失败: {e}")
        return
    print(f"读取到CSV文件,共 {len(df)} 行数据")
    print(f"CSV文件列名: {list(df.columns)}")
    if 'id' in df.columns:
        print(f"可用的ID列表: {sorted(df['id'].unique())}")
    else:
        print("警告:CSV文件中没有找到'id'")
        print("请检查CSV文件格式")
        return
    all_results = []
    all_reports = []
    # Process each id independently so one failure does not abort the batch.
    for current_id in sorted(df['id'].unique()):
        print(f"\n{'=' * 50}")
        print(f"处理ID: {current_id}")
        print(f"{'=' * 50}")
        target_row = df[df['id'] == current_id]
        if len(target_row) == 0:
            print(f"警告:没有找到ID={current_id}的数据")
            continue
        if 'merged_text' not in target_row.columns:
            print(f"错误:找不到merged_text列")
            continue
        original_text = target_row['merged_text'].iloc[0]
        if pd.isna(original_text) or str(original_text).strip() == '':
            print(f"警告:ID={current_id}的merged_text为空,跳过处理")
            # Record an all-zero row so the output keeps one row per id.
            all_results.append({
                'id': current_id, 'original_text': '', 'cleaned_text': '', 'final_processed_text': '',
                'original_length': 0, 'cleaned_length': 0, 'final_length': 0, 'paragraph_reduction': 0,
                'sentence_reduction': 0, 'phrase_reduction': 0, 'punctuation_reduction': 0,
                'total_reduction': 0, 'reduction_ratio': 0
            })
            continue
        print(f"原始文本长度: {len(original_text)} 字符")
        try:
            print("执行基础文本清洗...")
            cleaned_text = clean_text(str(original_text))
            print(f"清洗后文本长度: {len(cleaned_text)} 字符")
            final_text, dedup_report = comprehensive_deduplication(cleaned_text)
            print(f"处理完成")
            print(f"总体压缩比: {dedup_report['reduction_ratio']:.2%}")
            print(f"最终文本长度: {dedup_report['final_length']} 字符")
            # Per-stage reductions derived from the report's stage lengths.
            result_record = {
                'id': current_id,
                'original_text': original_text,
                'cleaned_text': cleaned_text,
                'final_processed_text': final_text,
                'original_length': len(str(original_text)),
                'cleaned_length': len(cleaned_text),
                'final_length': len(final_text),
                'paragraph_reduction': dedup_report['original_length'] - dedup_report['after_paragraph'],
                'sentence_reduction': dedup_report['after_paragraph'] - dedup_report['after_sentence'],
                'phrase_reduction': dedup_report['after_sentence'] - dedup_report['after_phrase'],
                'punctuation_reduction': dedup_report['after_phrase'] - dedup_report['final_length'],
                'total_reduction': dedup_report['total_reduction'],
                'reduction_ratio': dedup_report['reduction_ratio']
            }
            all_results.append(result_record)
            all_reports.append((current_id, dedup_report))
        except Exception as e:
            # Record the failure but keep processing the remaining ids.
            print(f"处理ID={current_id}时出错: {str(e)}")
            all_results.append({
                'id': current_id, 'original_text': str(original_text), 'cleaned_text': '', 'final_processed_text': '',
                'original_length': len(str(original_text)), 'cleaned_length': 0, 'final_length': 0,
                'paragraph_reduction': 0, 'sentence_reduction': 0, 'phrase_reduction': 0, 'punctuation_reduction': 0,
                'total_reduction': 0, 'reduction_ratio': 0
            })
    print(f"\n{'=' * 60}")
    print("所有ID处理完成!")
    print(f"{'=' * 60}")
    result_df = pd.DataFrame(all_results)
    print(f"总共处理: {len(all_results)} 个ID")
    print(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID")
    print(f"处理失败或跳过: {len([r for r in all_results if r['final_length'] == 0])} 个ID")
    if len(all_results) > 0:
        avg_reduction = result_df['reduction_ratio'].mean()
        print(f"平均压缩比: {avg_reduction:.2%}")
        print(f"总原始字符数: {result_df['original_length'].sum()}")
        print(f"总最终字符数: {result_df['final_length'].sum()}")
    # Persist the per-id results (utf-8-sig so Excel opens it correctly).
    try:
        result_df.to_csv('batch_deduplication_results_619-1103_01.csv', index=False, encoding='utf-8-sig')
        print("结果已保存到: batch_deduplication_results_619-1103_01.csv")
    except Exception as e:
        print(f"保存结果CSV时出错: {e}")
    # Write the detailed per-id report file.
    try:
        with open('batch_deduplication_report_619-1103_01.txt', 'w', encoding='utf-8') as f:
            f.write("=== 批量多级别去重详细报告 ===\n\n")
            f.write(f"处理日期: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"总共处理: {len(all_results)} 个ID\n")
            f.write(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID\n\n")
            if len(all_results) > 0:
                f.write("总体统计:\n")
                f.write(f"- 平均压缩比: {result_df['reduction_ratio'].mean():.2%}\n")
                f.write(f"- 总原始字符数: {result_df['original_length'].sum():,}\n")
                f.write(f"- 总最终字符数: {result_df['final_length'].sum():,}\n")
                f.write(f"- 总减少字符数: {result_df['total_reduction'].sum():,}\n\n")
            for id_num, report in all_reports:
                f.write(f"\n--- ID {id_num} 详细报告 ---\n")
                f.write(f"原始文本长度: {report['original_length']} 字符\n")
                f.write(f"最终文本长度: {report['final_length']} 字符\n")
                f.write(f"总体压缩比: {report['reduction_ratio']:.2%}\n")
                f.write("各级别处理效果:\n")
                f.write(f"1. 段落级去重: 减少 {report['original_length'] - report['after_paragraph']} 字符\n")
                f.write(f"2. 句子级去重: 减少 {report['after_paragraph'] - report['after_sentence']} 字符\n")
                f.write(f"3. 短语级去重: 减少 {report['after_sentence'] - report['after_phrase']} 字符\n")
                f.write(f"4. 最终标点规范化: 减少 {report['after_phrase'] - report['final_length']} 字符\n")
                for level, items in report['removed_items'].items():
                    if items:
                        f.write(f"{level.upper()}级别移除了 {len(items)} 项内容\n")
        # BUG FIX: this message previously named 'batch_deduplication_report_
        # 619-1103.txt' while the file actually written above is the
        # '..._619-1103_01.txt' variant.
        print("详细报告已保存到: batch_deduplication_report_619-1103_01.txt")
    except Exception as e:
        print(f"保存报告时出错: {e}")
    print(f"\n结果预览:")
    print(result_df[['id', 'original_length', 'final_length', 'reduction_ratio']].head(10))
if __name__ == "__main__":
    main()