# NOTE(review): the four lines that used to be here were web-scrape residue
# from the hosting UI ("You can not select more than 25 topics…",
# "408 lines", "16 KiB") — not part of the program. Converted to this
# comment so the module parses.

import pandas as pd
import re
from difflib import SequenceMatcher
from collections import Counter
import chardet
def detect_file_encoding(file_path):
    """Guess a file's character encoding from its leading bytes.

    Args:
        file_path (str): path of the file to probe.

    Returns:
        str | None: encoding name reported by chardet (chardet may return
        None when it cannot decide).
    """
    # Sampling only the first 10 KB is enough for chardet's detector and
    # avoids pulling a large file fully into memory.
    with open(file_path, 'rb') as fh:
        sample = fh.read(10000)
    return chardet.detect(sample)['encoding']
def safe_read_csv(file_path):
    """Read a CSV file, trying several encodings until one succeeds.

    Strategy: try chardet's guess first (when available), then a fixed list
    of encodings common for Chinese data, and finally fall back to UTF-8
    while ignoring undecodable bytes.

    Args:
        file_path (str): path of the CSV file.

    Returns:
        pandas.DataFrame: the parsed CSV content.

    Raises:
        Exception: if the file cannot be read with any strategy.
    """
    encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']
    # Put the auto-detected encoding (if any) at the front of the queue.
    try:
        detected_encoding = detect_file_encoding(file_path)
        if detected_encoding:
            encodings.insert(0, detected_encoding)
            print(f"检测到文件编码: {detected_encoding}")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; detection failure just falls back to the list.
        print("编码检测失败,使用默认编码列表")
    for encoding in encodings:
        try:
            print(f"尝试使用编码 {encoding} 读取文件...")
            df = pd.read_csv(file_path, encoding=encoding)
            print(f"成功使用编码 {encoding} 读取文件")
            return df
        except UnicodeDecodeError:
            print(f"编码 {encoding} 失败")
            continue
        except Exception as e:
            print(f"使用编码 {encoding} 时出现其他错误: {e}")
            continue
    # Last resort: decode as UTF-8 and drop undecodable bytes.
    try:
        print("尝试使用 utf-8 编码并忽略错误...")
        # BUG FIX: pandas.read_csv has no `errors` keyword; the byte-error
        # policy is passed via `encoding_errors` (pandas >= 1.3). The old
        # call raised TypeError, so this fallback could never work.
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='ignore')
        print("成功读取文件(忽略了一些字符)")
        return df
    except Exception as e:
        raise Exception(f"无法读取文件 {file_path}: {e}")
def clean_text(text):
    """Basic cleaning pass: normalise whitespace, strip HTML tags and
    disallowed characters, and collapse doubled punctuation.

    Args:
        text (str): raw input text.

    Returns:
        str: cleaned text with surrounding whitespace removed.
    """
    # Normalise every line ending to a space, then collapse whitespace runs.
    text = re.sub(r'\r\n|\r|\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Strip HTML tags, if any survived upstream processing.
    text = re.sub(r'<[^>]+>', '', text)
    # Keep CJK ideographs, ASCII alphanumerics and common Chinese/ASCII
    # punctuation (including the enumeration comma 、); drop everything else.
    # NOTE(review): the quote characters here were garbled in the original
    # (full-width quotes collapsed to ASCII, accidentally splitting the
    # pattern into two adjacent string literals); restored as the curly CJK
    # quotes “”‘’ — confirm against the upstream source.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、,%.。!?;:“”‘’()【】\-\s]', '', text)
    # First-pass punctuation normalisation: collapse each doubled mark to one.
    # NOTE(review): the replacement values were empty strings in the garbled
    # original, which *deleted* doubled punctuation entirely (and the map
    # contained a duplicated ',。' key); collapsing to a single mark is the
    # presumed intent — confirm.
    punctuation_map = {
        ',,': ',',
        '..': '。',
        ',。': '。',
        '!!': '!',
        '??': '?',
        ';;': ';',
    }
    for old, new in punctuation_map.items():
        text = text.replace(old, new)
    return text.strip()
def remove_paragraph_duplicates(text, similarity_threshold=0.85):
    """Paragraph-level dedup: drop paragraphs that are near-duplicates.

    For each pair of similar paragraphs the longer one is kept (on the
    assumption it carries more information).

    Args:
        text (str): input text.
        similarity_threshold (float): SequenceMatcher ratio above which two
            paragraphs are considered duplicates.

    Returns:
        tuple: (deduplicated text, list of human-readable removal notes).
    """
    # BUG FIX: the original separator literal was lost to mojibake —
    # text.split('') raises "ValueError: empty separator". '\n' is the
    # presumed paragraph delimiter — confirm against the upstream source.
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    unique_paragraphs = []
    removed_paragraphs = []
    for paragraph in paragraphs:
        is_similar = False
        for idx, existing in enumerate(unique_paragraphs):
            similarity = SequenceMatcher(None, paragraph, existing).ratio()
            if similarity > similarity_threshold:
                is_similar = True
                # Keep whichever of the two near-duplicates is longer.
                if len(paragraph) > len(existing):
                    removed_paragraphs.append(f"段落替换: {existing[:50]}...")
                    unique_paragraphs[idx] = paragraph
                else:
                    removed_paragraphs.append(f"段落重复: {paragraph[:50]}...")
                break
        if not is_similar:
            unique_paragraphs.append(paragraph)
    return '\n'.join(unique_paragraphs), removed_paragraphs
def remove_sentence_duplicates(text, similarity_threshold=0.9):
    """Sentence-level dedup: drop near-duplicate sentences, then re-attach
    terminal punctuation.

    Args:
        text (str): input text.
        similarity_threshold (float): SequenceMatcher ratio above which two
            sentences count as duplicates.

    Returns:
        tuple: (deduplicated text, list of human-readable removal notes).
    """
    # Split on full-width terminal punctuation; the delimiters are dropped
    # here and re-added heuristically below.
    sentences = re.split(r'[。!?;]', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    unique_sentences = []
    removed_sentences = []
    for sentence in sentences:
        is_duplicate = False
        for idx, existing in enumerate(unique_sentences):
            similarity = SequenceMatcher(None, sentence, existing).ratio()
            if similarity > similarity_threshold:
                is_duplicate = True
                # Keep whichever of the two near-duplicates is longer.
                if len(sentence) > len(existing):
                    removed_sentences.append(f"句子替换: {existing[:30]}...")
                    unique_sentences[idx] = sentence
                else:
                    removed_sentences.append(f"句子重复: {sentence[:30]}...")
                break
        if not is_duplicate:
            unique_sentences.append(sentence)
    # Re-attach terminal punctuation by simple keyword heuristics.
    # BUG FIX / NOTE(review): the garbled original had '' entries in the
    # keyword lists ('' in sentence is vacuously True, so EVERY sentence took
    # the first branch) and appended empty suffixes. The marks below
    # (!/?/。) and the endswith targets 吗/呢 are a reconstruction of the
    # presumed intent — confirm against the upstream source.
    result = []
    for sentence in unique_sentences:
        if not sentence:
            continue
        if any(word in sentence for word in ['提醒', '注意', '防止']):
            result.append(sentence + '!')
        elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
            result.append(sentence + '?')
        elif any(word in sentence for word in ['重要', '紧急', '警告']):
            result.append(sentence + '!')
        else:
            result.append(sentence + '。')
    return ''.join(result), removed_sentences
def remove_phrase_duplicates(text, min_phrase_length=4, max_phrase_length=20):
    """Phrase-level dedup: keep the first occurrence of a frequently
    repeated phrase and delete the later ones.

    Args:
        text (str): input text.
        min_phrase_length (int): minimum phrase length in characters.
        max_phrase_length (int): maximum phrase size in word runs.

    Returns:
        tuple: (cleaned text, list of human-readable removal notes).
    """
    # Tokenise into maximal runs of CJK/alphanumeric characters.
    words = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)
    # Build candidate phrases from 1..max_phrase_length consecutive runs.
    # BUG FIX: the original started at 4 runs, but a phrase joined from two
    # or more *maximal* runs can never literally reappear in the punctuated
    # text (the separators are gone from the join), so `phrase in text`
    # below almost never matched and the function removed nothing. Starting
    # at a single run makes repeated Chinese phrases (one run after
    # punctuation splitting) detectable; the character-length filters are
    # unchanged.
    phrases = []
    for n in range(1, min(max_phrase_length, len(words)) + 1):
        for i in range(len(words) - n + 1):
            phrase = ''.join(words[i:i + n])
            if len(phrase) >= min_phrase_length:
                phrases.append(phrase)
    phrase_counts = Counter(phrases)
    # Only phrases seen at least 3 times and at least 6 characters long are
    # treated as boilerplate worth removing.
    frequent_phrases = [(phrase, count) for phrase, count in phrase_counts.items()
                        if count >= 3 and len(phrase) >= 6]
    cleaned_text = text
    removed_phrases = []
    # Longest phrases first, so removing a long repeat does not leave
    # fragments that its shorter sub-phrases would then re-match.
    for phrase, count in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
        if phrase in cleaned_text:
            first_occurrence = cleaned_text.find(phrase)
            remaining_text = cleaned_text[first_occurrence + len(phrase):]
            removed_count = remaining_text.count(phrase)
            if removed_count > 0:
                cleaned_text = (cleaned_text[:first_occurrence + len(phrase)]
                                + remaining_text.replace(phrase, ''))
                removed_phrases.append(f"短语重复({removed_count}次): {phrase}")
    return cleaned_text, removed_phrases
def comprehensive_deduplication(text):
    """Run all dedup levels in order, then normalise punctuation.

    Pipeline: paragraph -> sentence -> phrase dedup, then a final pass that
    collapses doubled punctuation; each stage works on the previous stage's
    output and its reduction is printed.

    Args:
        text (str): cleaned input text.

    Returns:
        tuple: (final text, report dict with per-stage lengths, total
        reduction, reduction ratio and the removed items per level).
    """
    original_length = len(text)
    # 1. paragraph-level dedup
    print("1. 执行段落级别去重...")
    text, paragraph_removed = remove_paragraph_duplicates(text, 0.85)
    paragraph_length = len(text)
    print(f" 段落去重后长度: {paragraph_length} (减少 {original_length - paragraph_length} 字符)")
    # 2. sentence-level dedup
    print("2. 执行句子级别去重...")
    text, sentence_removed = remove_sentence_duplicates(text, 0.9)
    sentence_length = len(text)
    print(f" 句子去重后长度: {sentence_length} (减少 {paragraph_length - sentence_length} 字符)")
    # 3. phrase-level dedup
    print("3. 执行短语级别去重...")
    text, phrase_removed = remove_phrase_duplicates(text, 4, 15)
    phrase_length = len(text)
    print(f" 短语去重后长度: {phrase_length} (减少 {sentence_length - phrase_length} 字符)")
    # 4. final punctuation normalisation
    print("4. 执行最终标点符号规范化...")
    # NOTE(review): as in clean_text, the replacement values were lost in
    # the garbled original ('' silently deleted doubled punctuation, and the
    # ',。' key was duplicated); collapsing each doubled mark to a single
    # one is the presumed intent — confirm.
    punctuation_map = {
        ',,': ',',
        '..': '。',
        ',。': '。',
        '!!': '!',
        '??': '?',
        ';;': ';',
    }
    final_text = text
    for old, new in punctuation_map.items():
        final_text = final_text.replace(old, new)
    final_length = len(final_text)
    print(f" 最终规范化后长度: {final_length} (减少 {phrase_length - final_length} 字符)")
    # Detailed per-stage report for the caller / report file.
    report = {
        'original_length': original_length,
        'after_paragraph': paragraph_length,
        'after_sentence': sentence_length,
        'after_phrase': phrase_length,
        'final_length': final_length,
        'total_reduction': original_length - final_length,
        'reduction_ratio': (original_length - final_length) / original_length if original_length > 0 else 0,
        'removed_items': {
            'paragraphs': paragraph_removed,
            'sentences': sentence_removed,
            'phrases': phrase_removed
        }
    }
    return final_text, report
# Main processing pipeline
def main():
    """Batch entry point: dedupe the `merged_text` column of merged.csv per
    id and write a results CSV plus a detailed per-id text report."""
    print("开始多级别去重处理...\n")
    # Load the input CSV with encoding fallback.
    try:
        df = safe_read_csv('merged.csv')
    except Exception as e:
        print(f"读取CSV文件失败: {e}")
        return
    print(f"读取到CSV文件,共 {len(df)} 行数据")
    print(f"CSV文件列名: {list(df.columns)}")
    if 'id' in df.columns:
        print(f"可用的ID列表: {sorted(df['id'].unique())}")
    else:
        print("警告:CSV文件中没有找到'id'")
        print("请检查CSV文件格式")
        return
    all_results = []
    all_reports = []
    # Process each id independently so one failure does not abort the batch.
    for current_id in sorted(df['id'].unique()):
        print(f"\n{'=' * 50}")
        print(f"处理ID: {current_id}")
        print(f"{'=' * 50}")
        target_row = df[df['id'] == current_id]
        if len(target_row) == 0:
            print(f"警告:没有找到ID={current_id}的数据")
            continue
        if 'merged_text' not in target_row.columns:
            print(f"错误:找不到merged_text列")
            continue
        original_text = target_row['merged_text'].iloc[0]
        if pd.isna(original_text) or str(original_text).strip() == '':
            print(f"警告:ID={current_id}的merged_text为空,跳过处理")
            # Record an all-zero row so the output keeps one row per id.
            all_results.append({
                'id': current_id, 'original_text': '', 'cleaned_text': '', 'final_processed_text': '',
                'original_length': 0, 'cleaned_length': 0, 'final_length': 0, 'paragraph_reduction': 0,
                'sentence_reduction': 0, 'phrase_reduction': 0, 'punctuation_reduction': 0,
                'total_reduction': 0, 'reduction_ratio': 0
            })
            continue
        print(f"原始文本长度: {len(original_text)} 字符")
        try:
            print("执行基础文本清洗...")
            cleaned_text = clean_text(str(original_text))
            print(f"清洗后文本长度: {len(cleaned_text)} 字符")
            final_text, dedup_report = comprehensive_deduplication(cleaned_text)
            print(f"处理完成")
            print(f"总体压缩比: {dedup_report['reduction_ratio']:.2%}")
            print(f"最终文本长度: {dedup_report['final_length']} 字符")
            # Per-stage reductions derived from the report's stage lengths.
            result_record = {
                'id': current_id,
                'original_text': original_text,
                'cleaned_text': cleaned_text,
                'final_processed_text': final_text,
                'original_length': len(str(original_text)),
                'cleaned_length': len(cleaned_text),
                'final_length': len(final_text),
                'paragraph_reduction': dedup_report['original_length'] - dedup_report['after_paragraph'],
                'sentence_reduction': dedup_report['after_paragraph'] - dedup_report['after_sentence'],
                'phrase_reduction': dedup_report['after_sentence'] - dedup_report['after_phrase'],
                'punctuation_reduction': dedup_report['after_phrase'] - dedup_report['final_length'],
                'total_reduction': dedup_report['total_reduction'],
                'reduction_ratio': dedup_report['reduction_ratio']
            }
            all_results.append(result_record)
            all_reports.append((current_id, dedup_report))
        except Exception as e:
            # Record the failure but keep processing the remaining ids.
            print(f"处理ID={current_id}时出错: {str(e)}")
            all_results.append({
                'id': current_id, 'original_text': str(original_text), 'cleaned_text': '', 'final_processed_text': '',
                'original_length': len(str(original_text)), 'cleaned_length': 0, 'final_length': 0,
                'paragraph_reduction': 0, 'sentence_reduction': 0, 'phrase_reduction': 0, 'punctuation_reduction': 0,
                'total_reduction': 0, 'reduction_ratio': 0
            })
    print(f"\n{'=' * 60}")
    print("所有ID处理完成!")
    print(f"{'=' * 60}")
    result_df = pd.DataFrame(all_results)
    print(f"总共处理: {len(all_results)} 个ID")
    print(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID")
    print(f"处理失败或跳过: {len([r for r in all_results if r['final_length'] == 0])} 个ID")
    if len(all_results) > 0:
        avg_reduction = result_df['reduction_ratio'].mean()
        print(f"平均压缩比: {avg_reduction:.2%}")
        print(f"总原始字符数: {result_df['original_length'].sum()}")
        print(f"总最终字符数: {result_df['final_length'].sum()}")
    # Persist the per-id results (utf-8-sig so Excel opens it correctly).
    try:
        result_df.to_csv('batch_deduplication_results_619-1103_01.csv', index=False, encoding='utf-8-sig')
        print("结果已保存到: batch_deduplication_results_619-1103_01.csv")
    except Exception as e:
        print(f"保存结果CSV时出错: {e}")
    # Write the detailed per-id report file.
    try:
        with open('batch_deduplication_report_619-1103_01.txt', 'w', encoding='utf-8') as f:
            f.write("=== 批量多级别去重详细报告 ===\n\n")
            f.write(f"处理日期: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"总共处理: {len(all_results)} 个ID\n")
            f.write(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID\n\n")
            if len(all_results) > 0:
                f.write("总体统计:\n")
                f.write(f"- 平均压缩比: {result_df['reduction_ratio'].mean():.2%}\n")
                f.write(f"- 总原始字符数: {result_df['original_length'].sum():,}\n")
                f.write(f"- 总最终字符数: {result_df['final_length'].sum():,}\n")
                f.write(f"- 总减少字符数: {result_df['total_reduction'].sum():,}\n\n")
            for id_num, report in all_reports:
                f.write(f"\n--- ID {id_num} 详细报告 ---\n")
                f.write(f"原始文本长度: {report['original_length']} 字符\n")
                f.write(f"最终文本长度: {report['final_length']} 字符\n")
                f.write(f"总体压缩比: {report['reduction_ratio']:.2%}\n")
                f.write("各级别处理效果:\n")
                f.write(f"1. 段落级去重: 减少 {report['original_length'] - report['after_paragraph']} 字符\n")
                f.write(f"2. 句子级去重: 减少 {report['after_paragraph'] - report['after_sentence']} 字符\n")
                f.write(f"3. 短语级去重: 减少 {report['after_sentence'] - report['after_phrase']} 字符\n")
                f.write(f"4. 最终标点规范化: 减少 {report['after_phrase'] - report['final_length']} 字符\n")
                for level, items in report['removed_items'].items():
                    if items:
                        f.write(f"{level.upper()}级别移除了 {len(items)} 项内容\n")
        # BUG FIX: this message previously named 'batch_deduplication_report_
        # 619-1103.txt' while the file actually written above is the
        # '..._619-1103_01.txt' variant.
        print("详细报告已保存到: batch_deduplication_report_619-1103_01.txt")
    except Exception as e:
        print(f"保存报告时出错: {e}")
    print(f"\n结果预览:")
    print(result_df[['id', 'original_length', 'final_length', 'reduction_ratio']].head(10))
if __name__ == "__main__":
    main()