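"""Deduplication utility for Chinese broadcast text.

Reads broadcast records from a JSON file, removes duplicated paragraphs,
sentences, and phrases, and writes the cleaned results back to JSON.
"""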
import json
import re
import chardet
from difflib import SequenceMatcher
from collections import Counter
from typing import Union, List, Dict, Any
import os
class BroadcastDeduplicator:
"""广播去重处理类"""
def __init__(self):
pass
def detect_file_encoding(self, file_path: str) -> str:
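        """Detect the file encoding by running chardet over the first 10,000 bytes."""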
with open(file_path, 'rb') as f:
raw_data = f.read(10000)
result = chardet.detect(raw_data)
return result['encoding']
def safe_read_json(self, file_path: str) -> Union[Dict, List]:
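        """Read a JSON file, trying the detected encoding first and then a list of common fallbacks."""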
encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']
try:
detected_encoding = self.detect_file_encoding(file_path)
if detected_encoding:
encodings.insert(0, detected_encoding)
print(f"检测到文件编码: {detected_encoding}")
        except Exception:
            print("Encoding detection failed, falling back to the default encoding list")
for encoding in encodings:
try:
print(f"尝试使用编码 {encoding} 读取文件...")
with open(file_path, 'r', encoding=encoding) as f:
data = json.load(f)
print(f"成功使用编码 {encoding} 读取文件")
return data
except UnicodeDecodeError:
print(f"编码 {encoding} 失败")
continue
except json.JSONDecodeError as e:
print(f"JSON格式错误: {e}")
raise
except Exception as e:
print(f"使用编码 {encoding} 时出现其他错误: {e}")
continue
raise Exception(f"无法读取文件 {file_path}")
def clean_text(self, text: str) -> str:
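        """Normalize whitespace, strip HTML tags, remove characters outside the allowed Chinese/ASCII set, and collapse doubled punctuation."""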
if not isinstance(text, str):
return str(text)
text = re.sub(r'\r\n|\r|\n', ' ', text)
text = re.sub(r'\s+', ' ', text)
text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、,%.。!~?;:“”‘’()【】\-\s]', '', text)
        # Collapse doubled or mixed punctuation into a single mark
        punctuation_map = {
            ',,': ',',
            '。。': '。',
            ',。': '。',
            '。,': '。',
            '!!': '!',
            '??': '?',
            ';;': ';'
        }
for old, new in punctuation_map.items():
text = text.replace(old, new)
return text.strip()
def remove_paragraph_duplicates(self, text: str, similarity_threshold: float = 0.85) -> str:
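        """Drop paragraphs that are near-duplicates of one another, keeping the longer variant."""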
        paragraphs = text.split('\n')
paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p) > 0]
unique_paragraphs = []
for paragraph in paragraphs:
is_similar = False
for existing in unique_paragraphs:
similarity = SequenceMatcher(None, paragraph, existing).ratio()
if similarity > similarity_threshold:
is_similar = True
if len(paragraph) > len(existing):
unique_paragraphs[unique_paragraphs.index(existing)] = paragraph
break
if not is_similar:
unique_paragraphs.append(paragraph)
        return '\n'.join(unique_paragraphs)
def remove_sentence_duplicates(self, text: str, similarity_threshold: float = 0.9) -> str:
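        """Remove near-duplicate sentences and re-attach the terminal punctuation stripped by the split."""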
sentences = re.split(r'[。!?;]', text)
sentences = [s.strip() for s in sentences if s.strip() and len(s) > 0]
unique_sentences = []
for sentence in sentences:
is_duplicate = False
for existing in unique_sentences:
similarity = SequenceMatcher(None, sentence, existing).ratio()
if similarity > similarity_threshold:
is_duplicate = True
if len(sentence) > len(existing):
unique_sentences[unique_sentences.index(existing)] = sentence
break
if not is_duplicate:
unique_sentences.append(sentence)
result = []
for sentence in unique_sentences:
if sentence:
                # Restore the terminal punctuation that re.split removed
                if any(word in sentence for word in ['提醒', '注意', '防止']):
                    result.append(sentence + '!')
                elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
                    result.append(sentence + '?')
                elif any(word in sentence for word in ['重要', '紧急', '警告']):
                    result.append(sentence + '!')
                else:
                    result.append(sentence + '。')
return ''.join(result)
def remove_phrase_duplicates(self, text: str, min_phrase_length: int = 4, max_phrase_length: int = 20) -> str:
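        """Count repeated word n-grams and keep only the first occurrence of frequently repeated phrases."""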
words = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)
phrases = []
for n in range(min_phrase_length, min(max_phrase_length + 1, len(words) + 1)):
for i in range(len(words) - n + 1):
phrase = ''.join(words[i:i + n])
if len(phrase) >= min_phrase_length:
phrases.append(phrase)
phrase_counts = Counter(phrases)
frequent_phrases = [(phrase, count) for phrase, count in phrase_counts.items()
if count >= 3 and len(phrase) >= 6]
cleaned_text = text
for phrase, count in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
if phrase in cleaned_text:
first_occurrence = cleaned_text.find(phrase)
remaining_text = cleaned_text[first_occurrence + len(phrase):]
removed_count = remaining_text.count(phrase)
if removed_count > 0:
cleaned_text = cleaned_text[:first_occurrence + len(phrase)] + remaining_text.replace(phrase, '')
return cleaned_text
def comprehensive_deduplication(self, text: str) -> str:
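        """Run the full pipeline: clean the text, then deduplicate at paragraph, sentence, and phrase level."""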
        # 1. Clean the raw text
        text = self.clean_text(text)
        # 2. Paragraph-level deduplication
        text = self.remove_paragraph_duplicates(text, 0.85)
        # 3. Sentence-level deduplication
        text = self.remove_sentence_duplicates(text, 0.9)
        # 4. Phrase-level deduplication
        text = self.remove_phrase_duplicates(text, 4, 15)
        # 5. Final punctuation normalization
        punctuation_map = {
            ',,': ',',
            '。。': '。',
            ',。': '。',
            '。,': '。',
            '!!': '!',
            '??': '?',
            ';;': ';'
        }
for old, new in punctuation_map.items():
text = text.replace(old, new)
return text
def process_single_broadcast(self, broadcast_data: Dict[str, Any]) -> Dict[str, Any]:
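        """Deduplicate one broadcast record and return the result together with a processing status."""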
broadcast_id = broadcast_data.get('广播ID', 'unknown')
content = broadcast_data.get('广播内容', '')
print(f"处理广播ID: {broadcast_id}")
if not content:
return {
'broadcast_id': broadcast_id,
'original_content': content,
'deduplicated_content': content,
'processing_status': 'empty_content'
}
try:
deduplicated_content = self.comprehensive_deduplication(content)
return {
'broadcast_id': broadcast_id,
'original_content': content,
'deduplicated_content': deduplicated_content,
'processing_status': 'success'
}
except Exception as e:
print(f"处理广播ID {broadcast_id} 时出错: {str(e)}")
return {
'broadcast_id': broadcast_id,
'original_content': content,
'deduplicated_content': content,
'processing_status': 'error'
}
def process_broadcast_data(self, input_file: str = 'test.json', output_file: str = 'deduplication_results.json'):
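        """Read broadcasts from input_file, deduplicate each one, and save the successful results to output_file."""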
try:
            # Read the input file
            print(f"Reading input file: {input_file}")
data = self.safe_read_json(input_file)
results = []
            # Decide how to process based on the data type
            if isinstance(data, dict):
                # Single broadcast object
                print("Detected a single broadcast record")
result = self.process_single_broadcast(data)
results.append(result)
elif isinstance(data, list):
                # List of broadcasts
                print(f"Detected a broadcast list with {len(data)} entries")
for i, broadcast in enumerate(data, 1):
print(f"处理第 {i}/{len(data)} 条广播")
result = self.process_single_broadcast(broadcast)
results.append(result)
else:
raise ValueError("不支持的数据格式,请提供单条广播对象或广播数组")
simplified_results = []
successful_count = 0
for result in results:
if result['processing_status'] == 'success':
simplified_item = {
'broadcast_id': result['broadcast_id'],
'original_content': result['original_content'],
'deduplicated_content': result['deduplicated_content']
}
simplified_results.append(simplified_item)
successful_count += 1
            # Print processing statistics
            print("\nProcessing complete!")
            print(f"Total broadcasts processed: {len(results)}")
            print(f"Successfully processed: {successful_count}")
            print(f"Failed: {len(results) - successful_count}")
            # Save the simplified results
            print(f"\nSaving simplified results to: {output_file}")
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(simplified_results, f, ensure_ascii=False, indent=2)
print("处理完成!")
return simplified_results
except Exception as e:
print(f"处理过程中出现错误: {str(e)}")
raise
def main():
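    """Entry point: check that the input file exists, then run the deduplication pipeline."""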
deduplicator = BroadcastDeduplicator()
    # Check that the input file exists
input_file = 'test.json'
if not os.path.exists(input_file):
print(f"输入文件 {input_file} 不存在!")
print("请创建包含广播数据的 test.json 文件")
print("\n支持的格式示例:")
print("1. 单条广播:")
print('{"广播内容": "今天天气很好。今天天气很好。", "广播ID": "broadcast_001"}')
print("\n2. 广播数组:")
print('[{"广播内容": "第一条...", "广播ID": "001"}, {"广播内容": "第二条...", "广播ID": "002"}]')
return
try:
results = deduplicator.process_broadcast_data(input_file, 'deduplication_results.json')
print(f"\n简化结果已保存到 deduplication_results.json")
print(f"成功处理了 {len(results)} 条广播")
except Exception as e:
print(f"程序执行失败: {str(e)}")
if __name__ == "__main__":
main()