"""Multi-level Chinese text deduplication pipeline for a merged CSV corpus.

Reads `merged.csv`, cleans each row's `merged_text`, removes duplicated
paragraphs / sentences / phrases, normalizes punctuation, and writes a
results CSV plus a detailed per-ID report to disk.
"""

import re
from collections import Counter
from difflib import SequenceMatcher

import pandas as pd

# chardet is an optional third-party dependency used only as a best-effort
# encoding hint; degrade gracefully when it is not installed instead of
# failing at import time.
try:
    import chardet
except ImportError:
    chardet = None

# Collapse doubled punctuation into a single mark. Shared by clean_text()
# and comprehensive_deduplication(), which previously each carried their own
# copy of this table. NOTE(review): the original literal listed the
# ',。' -> '。' entry twice; a duplicated dict key is a no-op, so it is kept
# once here — confirm no distinct half-width variant was intended.
_PUNCTUATION_MAP = {
    ',,': ',',
    '..': '。',
    ',。': '。',
    '!!': '!',
    '??': '?',
    ';;': ';',
}


def _normalize_punctuation(text):
    """Apply one pass of the doubled-punctuation replacements to *text*."""
    for old, new in _PUNCTUATION_MAP.items():
        text = text.replace(old, new)
    return text


def detect_file_encoding(file_path):
    """Detect a file's encoding by sampling its first 10 KB with chardet.

    Returns the detected encoding name, or None when chardet is unavailable
    or inconclusive.
    """
    if chardet is None:
        return None
    with open(file_path, 'rb') as f:
        raw_data = f.read(10000)  # the first 10KB is enough for detection
    result = chardet.detect(raw_data)
    return result['encoding']


def safe_read_csv(file_path):
    """Read a CSV file, trying a list of candidate encodings in order.

    The chardet-detected encoding (if any) is tried first, followed by common
    Chinese/Unicode encodings. As a last resort the file is read as UTF-8
    with undecodable bytes ignored.

    Raises:
        Exception: if the file cannot be read with any strategy.
    """
    encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']

    # Put the auto-detected encoding at the front of the candidate list.
    try:
        detected_encoding = detect_file_encoding(file_path)
        if detected_encoding:
            encodings.insert(0, detected_encoding)
            print(f"检测到文件编码: {detected_encoding}")
    except Exception:  # was a bare `except:`; don't swallow SystemExit etc.
        print("编码检测失败,使用默认编码列表")

    for encoding in encodings:
        try:
            print(f"尝试使用编码 {encoding} 读取文件...")
            df = pd.read_csv(file_path, encoding=encoding)
            print(f"成功使用编码 {encoding} 读取文件")
            return df
        except UnicodeDecodeError:
            print(f"编码 {encoding} 失败")
            continue
        except Exception as e:
            print(f"使用编码 {encoding} 时出现其他错误: {e}")
            continue

    # Last resort: ignore undecodable bytes. BUG FIX: pandas.read_csv has no
    # `errors` keyword — the decode policy is passed via `encoding_errors`
    # (pandas >= 1.3); the original call raised TypeError instead of
    # salvaging the file.
    try:
        print("尝试使用 utf-8 编码并忽略错误...")
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='ignore')
        print("成功读取文件(忽略了一些字符)")
        return df
    except Exception as e:
        raise Exception(f"无法读取文件 {file_path}: {e}")


def clean_text(text):
    """Basic cleanup: normalize whitespace, strip HTML tags, drop characters
    outside the allowed set, and collapse doubled punctuation."""
    # Unify line endings, then collapse whitespace runs into single spaces.
    text = re.sub(r'\r\n|\r|\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Drop HTML tags, if any.
    text = re.sub(r'<[^>]+>', '', text)
    # Keep Chinese, ASCII letters/digits and common punctuation (incl. '、').
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、,%.。!?;:""''()【】\-\s]', '', text)
    # First punctuation-normalization pass.
    text = _normalize_punctuation(text)
    return text.strip()


def remove_paragraph_duplicates(text, similarity_threshold=0.85):
    """Paragraph-level dedup: drop paragraphs similar to an earlier one.

    Paragraphs are the '。'-separated chunks of *text*. When two are similar
    above the threshold the longer variant is kept. Returns the deduplicated
    text and a list of human-readable removal notes.
    """
    paragraphs = [p.strip() for p in text.split('。') if p.strip()]

    unique_paragraphs = []
    removed_paragraphs = []
    for paragraph in paragraphs:
        is_similar = False
        for idx, existing in enumerate(unique_paragraphs):
            similarity = SequenceMatcher(None, paragraph, existing).ratio()
            if similarity > similarity_threshold:
                is_similar = True
                if len(paragraph) > len(existing):
                    # Keep the longer variant in place of the shorter one.
                    removed_paragraphs.append(f"段落替换: {existing[:50]}...")
                    unique_paragraphs[idx] = paragraph
                else:
                    removed_paragraphs.append(f"段落重复: {paragraph[:50]}...")
                break
        if not is_similar:
            unique_paragraphs.append(paragraph)

    return '。'.join(unique_paragraphs), removed_paragraphs


def remove_sentence_duplicates(text, similarity_threshold=0.9):
    """Sentence-level dedup, then re-attach terminal punctuation chosen
    heuristically from each sentence's content.

    Sentences are split on 。!?; (splitting on '、' as well would be too
    fine-grained, so it is deliberately not included).
    """
    sentences = [s.strip() for s in re.split(r'[。!?;]', text) if s.strip()]

    unique_sentences = []
    removed_sentences = []
    for sentence in sentences:
        is_duplicate = False
        for idx, existing in enumerate(unique_sentences):
            similarity = SequenceMatcher(None, sentence, existing).ratio()
            if similarity > similarity_threshold:
                is_duplicate = True
                if len(sentence) > len(existing):
                    removed_sentences.append(f"句子替换: {existing[:30]}...")
                    unique_sentences[idx] = sentence
                else:
                    removed_sentences.append(f"句子重复: {sentence[:30]}...")
                break
        if not is_duplicate:
            unique_sentences.append(sentence)

    # Re-punctuate each surviving sentence: 。 / ? / ! by content heuristic.
    result = []
    for sentence in unique_sentences:
        if not sentence:
            continue
        if any(word in sentence for word in ['请', '提醒', '注意', '防止']):
            result.append(sentence + '。')
        elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
            result.append(sentence + '?')
        elif any(word in sentence for word in ['!', '重要', '紧急', '警告']):
            result.append(sentence + '!')
        else:
            result.append(sentence + '。')
    return ''.join(result), removed_sentences


def remove_phrase_duplicates(text, min_phrase_length=4, max_phrase_length=20):
    """Phrase-level dedup: find word n-grams that repeat at least 3 times
    and strip every occurrence after the first.

    Returns the cleaned text and a list of removal notes.
    """
    words = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)

    # Collect candidate n-grams (min..max words, joined without separators).
    phrases = []
    for n in range(min_phrase_length, min(max_phrase_length + 1, len(words) + 1)):
        for i in range(len(words) - n + 1):
            phrase = ''.join(words[i:i + n])
            if len(phrase) >= min_phrase_length:
                phrases.append(phrase)

    phrase_counts = Counter(phrases)
    # Only phrases long enough (>= 6 chars) and frequent enough (>= 3 hits).
    frequent_phrases = [(phrase, count) for phrase, count in phrase_counts.items()
                        if count >= 3 and len(phrase) >= 6]

    cleaned_text = text
    removed_phrases = []
    # Longest phrases first so short sub-phrases don't break longer matches.
    for phrase, count in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
        if phrase in cleaned_text:
            first_occurrence = cleaned_text.find(phrase)
            remaining_text = cleaned_text[first_occurrence + len(phrase):]
            removed_count = remaining_text.count(phrase)
            if removed_count > 0:
                cleaned_text = (cleaned_text[:first_occurrence + len(phrase)]
                                + remaining_text.replace(phrase, ''))
                removed_phrases.append(f"短语重复({removed_count}次): {phrase}")
    return cleaned_text, removed_phrases


def comprehensive_deduplication(text):
    """Run all dedup levels in order (paragraph -> sentence -> phrase),
    then a final punctuation-normalization pass.

    Returns:
        (final_text, report): report holds per-stage lengths, reduction
        figures and the lists of removed items from each stage.
    """
    original_length = len(text)

    print("1. 执行段落级别去重...")
    text, paragraph_removed = remove_paragraph_duplicates(text, 0.85)
    paragraph_length = len(text)
    print(f" 段落去重后长度: {paragraph_length} (减少 {original_length - paragraph_length} 字符)")

    print("2. 执行句子级别去重...")
    text, sentence_removed = remove_sentence_duplicates(text, 0.9)
    sentence_length = len(text)
    print(f" 句子去重后长度: {sentence_length} (减少 {paragraph_length - sentence_length} 字符)")

    print("3. 执行短语级别去重...")
    text, phrase_removed = remove_phrase_duplicates(text, 4, 15)
    phrase_length = len(text)
    print(f" 短语去重后长度: {phrase_length} (减少 {sentence_length - phrase_length} 字符)")

    print("4. 执行最终标点符号规范化...")
    final_text = _normalize_punctuation(text)
    final_length = len(final_text)
    print(f" 最终规范化后长度: {final_length} (减少 {phrase_length - final_length} 字符)")

    report = {
        'original_length': original_length,
        'after_paragraph': paragraph_length,
        'after_sentence': sentence_length,
        'after_phrase': phrase_length,
        'final_length': final_length,
        'total_reduction': original_length - final_length,
        'reduction_ratio': (original_length - final_length) / original_length if original_length > 0 else 0,
        'removed_items': {
            'paragraphs': paragraph_removed,
            'sentences': sentence_removed,
            'phrases': phrase_removed,
        },
    }
    return final_text, report


def _blank_result(current_id, original_text=''):
    """A zeroed result row for rows that are empty or failed to process."""
    return {
        'id': current_id,
        'original_text': original_text,
        'cleaned_text': '',
        'final_processed_text': '',
        'original_length': len(original_text),
        'cleaned_length': 0,
        'final_length': 0,
        'paragraph_reduction': 0,
        'sentence_reduction': 0,
        'phrase_reduction': 0,
        'punctuation_reduction': 0,
        'total_reduction': 0,
        'reduction_ratio': 0,
    }


def _write_report(result_df, all_results, all_reports):
    """Write the detailed per-ID deduplication report text file."""
    with open('batch_deduplication_report_619-1103_01.txt', 'w', encoding='utf-8') as f:
        f.write("=== 批量多级别去重详细报告 ===\n\n")
        f.write(f"处理日期: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"总共处理: {len(all_results)} 个ID\n")
        f.write(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID\n\n")
        if len(all_results) > 0:
            f.write("总体统计:\n")
            f.write(f"- 平均压缩比: {result_df['reduction_ratio'].mean():.2%}\n")
            f.write(f"- 总原始字符数: {result_df['original_length'].sum():,}\n")
            f.write(f"- 总最终字符数: {result_df['final_length'].sum():,}\n")
            f.write(f"- 总减少字符数: {result_df['total_reduction'].sum():,}\n\n")
        for id_num, report in all_reports:
            f.write(f"\n--- ID {id_num} 详细报告 ---\n")
            f.write(f"原始文本长度: {report['original_length']} 字符\n")
            f.write(f"最终文本长度: {report['final_length']} 字符\n")
            f.write(f"总体压缩比: {report['reduction_ratio']:.2%}\n")
            f.write("各级别处理效果:\n")
            f.write(f"1. 段落级去重: 减少 {report['original_length'] - report['after_paragraph']} 字符\n")
            f.write(f"2. 句子级去重: 减少 {report['after_paragraph'] - report['after_sentence']} 字符\n")
            f.write(f"3. 短语级去重: 减少 {report['after_sentence'] - report['after_phrase']} 字符\n")
            f.write(f"4. 最终标点规范化: 减少 {report['after_phrase'] - report['final_length']} 字符\n")
            for level, items in report['removed_items'].items():
                if items:
                    f.write(f"{level.upper()}级别移除了 {len(items)} 项内容\n")


def main():
    """Batch entry point: dedup every ID's merged_text in merged.csv and
    save a results CSV plus a detailed text report."""
    print("开始多级别去重处理...\n")

    try:
        df = safe_read_csv('merged.csv')
    except Exception as e:
        print(f"读取CSV文件失败: {e}")
        return

    print(f"读取到CSV文件,共 {len(df)} 行数据")
    print(f"CSV文件列名: {list(df.columns)}")
    if 'id' in df.columns:
        print(f"可用的ID列表: {sorted(df['id'].unique())}")
    else:
        print("警告:CSV文件中没有找到'id'列")
        print("请检查CSV文件格式")
        return

    all_results = []
    all_reports = []

    for current_id in sorted(df['id'].unique()):
        print(f"\n{'=' * 50}")
        print(f"处理ID: {current_id}")
        print(f"{'=' * 50}")

        target_row = df[df['id'] == current_id]
        if len(target_row) == 0:
            print(f"警告:没有找到ID={current_id}的数据")
            continue
        if 'merged_text' not in target_row.columns:
            print(f"错误:找不到merged_text列")
            continue

        original_text = target_row['merged_text'].iloc[0]
        if pd.isna(original_text) or str(original_text).strip() == '':
            print(f"警告:ID={current_id}的merged_text为空,跳过处理")
            all_results.append(_blank_result(current_id))
            continue

        print(f"原始文本长度: {len(original_text)} 字符")
        try:
            print("执行基础文本清洗...")
            cleaned_text = clean_text(str(original_text))
            print(f"清洗后文本长度: {len(cleaned_text)} 字符")

            final_text, dedup_report = comprehensive_deduplication(cleaned_text)
            print(f"处理完成")
            print(f"总体压缩比: {dedup_report['reduction_ratio']:.2%}")
            print(f"最终文本长度: {dedup_report['final_length']} 字符")

            result_record = {
                'id': current_id,
                'original_text': original_text,
                'cleaned_text': cleaned_text,
                'final_processed_text': final_text,
                'original_length': len(str(original_text)),
                'cleaned_length': len(cleaned_text),
                'final_length': len(final_text),
                'paragraph_reduction': dedup_report['original_length'] - dedup_report['after_paragraph'],
                'sentence_reduction': dedup_report['after_paragraph'] - dedup_report['after_sentence'],
                'phrase_reduction': dedup_report['after_sentence'] - dedup_report['after_phrase'],
                'punctuation_reduction': dedup_report['after_phrase'] - dedup_report['final_length'],
                'total_reduction': dedup_report['total_reduction'],
                'reduction_ratio': dedup_report['reduction_ratio'],
            }
            all_results.append(result_record)
            all_reports.append((current_id, dedup_report))
        except Exception as e:
            print(f"处理ID={current_id}时出错: {str(e)}")
            all_results.append(_blank_result(current_id, str(original_text)))

    print(f"\n{'=' * 60}")
    print("所有ID处理完成!")
    print(f"{'=' * 60}")

    result_df = pd.DataFrame(all_results)
    print(f"总共处理: {len(all_results)} 个ID")
    print(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID")
    print(f"处理失败或跳过: {len([r for r in all_results if r['final_length'] == 0])} 个ID")

    if len(all_results) > 0:
        avg_reduction = result_df['reduction_ratio'].mean()
        print(f"平均压缩比: {avg_reduction:.2%}")
        print(f"总原始字符数: {result_df['original_length'].sum()}")
        print(f"总最终字符数: {result_df['final_length'].sum()}")

    try:
        result_df.to_csv('batch_deduplication_results_619-1103_01.csv', index=False, encoding='utf-8-sig')
        print("结果已保存到: batch_deduplication_results_619-1103_01.csv")
    except Exception as e:
        print(f"保存结果CSV时出错: {e}")

    try:
        _write_report(result_df, all_results, all_reports)
        # BUG FIX: the success message previously named the wrong file
        # ('..._619-1103.txt'); it now matches the file actually written.
        print("详细报告已保存到: batch_deduplication_report_619-1103_01.txt")
    except Exception as e:
        print(f"保存报告时出错: {e}")

    print(f"\n结果预览:")
    print(result_df[['id', 'original_length', 'final_length', 'reduction_ratio']].head(10))


if __name__ == "__main__":
    main()