"""Broadcast text deduplication.

Reads broadcast records from a JSON file, removes duplicated paragraphs,
sentences, and phrases from each broadcast's content, and writes the
cleaned results to a JSON file.
"""

import json
import os
import re
from collections import Counter
from difflib import SequenceMatcher
from typing import Any, Dict, List, Optional, Union

import chardet


class BroadcastDeduplicator:
    """Deduplication pipeline for broadcast text."""

    # Shared punctuation-normalization table: collapse doubled marks and
    # mixed comma/period sequences into a single mark. Full-width forms are
    # the ones that survive the whitelist in clean_text; ASCII '..' can also
    # survive, so it is normalized as well.
    PUNCTUATION_MAP = {
        '，，': '，',
        '。。': '。',
        '..': '。',
        '，。': '。',
        '!!': '!',
        '??': '?',
        ';;': ';',
    }

    def detect_file_encoding(self, file_path: str) -> Optional[str]:
        """Detect the file encoding by sampling the first 10,000 bytes."""
        with open(file_path, 'rb') as f:
            raw_data = f.read(10000)
        result = chardet.detect(raw_data)
        return result['encoding']

    def safe_read_json(self, file_path: str) -> Union[Dict, List]:
        """Read a JSON file, trying the detected encoding first, then a fallback list."""
        encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']
        try:
            detected_encoding = self.detect_file_encoding(file_path)
            if detected_encoding:
                encodings.insert(0, detected_encoding)
                print(f"Detected file encoding: {detected_encoding}")
        except Exception:
            print("Encoding detection failed; falling back to the default encoding list")

        for encoding in encodings:
            try:
                print(f"Trying to read the file with encoding {encoding}...")
                with open(file_path, 'r', encoding=encoding) as f:
                    data = json.load(f)
                print(f"Successfully read the file with encoding {encoding}")
                return data
            except UnicodeDecodeError:
                print(f"Encoding {encoding} failed")
                continue
            except json.JSONDecodeError as e:
                print(f"Invalid JSON: {e}")
                raise
            except Exception as e:
                print(f"Unexpected error with encoding {encoding}: {e}")
                continue

        raise IOError(f"Unable to read file {file_path}")

    def clean_text(self, text: str) -> str:
        """Normalize whitespace, strip HTML tags, and whitelist allowed characters."""
        if not isinstance(text, str):
            return str(text)

        # Collapse line breaks and runs of whitespace into single spaces.
        text = re.sub(r'\r\n|\r|\n', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        # Strip HTML tags.
        text = re.sub(r'<[^>]+>', '', text)
        # Keep only CJK characters, alphanumerics, and common punctuation.
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、，%.。!~?;:""''()【】\-\s]', '', text)

        for old, new in self.PUNCTUATION_MAP.items():
            text = text.replace(old, new)
        return text.strip()

    def remove_paragraph_duplicates(self, text: str, similarity_threshold: float = 0.85) -> str:
        """Drop near-duplicate paragraphs, keeping the longer variant of each pair."""
        paragraphs = [p.strip() for p in text.split('。') if p.strip()]

        unique_paragraphs = []
        for paragraph in paragraphs:
            is_similar = False
            for i, existing in enumerate(unique_paragraphs):
                similarity = SequenceMatcher(None, paragraph, existing).ratio()
                if similarity > similarity_threshold:
                    is_similar = True
                    # Prefer the longer variant, assuming it carries more information.
                    if len(paragraph) > len(existing):
                        unique_paragraphs[i] = paragraph
                    break
            if not is_similar:
                unique_paragraphs.append(paragraph)
        return '。'.join(unique_paragraphs)

    def remove_sentence_duplicates(self, text: str, similarity_threshold: float = 0.9) -> str:
        """Drop near-duplicate sentences, then restore sentence-final punctuation."""
        sentences = [s.strip() for s in re.split(r'[。!?;]', text) if s.strip()]

        unique_sentences = []
        for sentence in sentences:
            is_duplicate = False
            for i, existing in enumerate(unique_sentences):
                similarity = SequenceMatcher(None, sentence, existing).ratio()
                if similarity > similarity_threshold:
                    is_duplicate = True
                    if len(sentence) > len(existing):
                        unique_sentences[i] = sentence
                    break
            if not is_duplicate:
                unique_sentences.append(sentence)

        # Splitting discarded the original terminal punctuation, so pick a
        # terminal mark heuristically from each sentence's wording.
        result = []
        for sentence in unique_sentences:
            if any(word in sentence for word in ['请', '提醒', '注意', '防止']):
                result.append(sentence + '。')
            elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
                result.append(sentence + '?')
            elif any(word in sentence for word in ['!', '重要', '紧急', '警告']):
                result.append(sentence + '!')
            else:
                result.append(sentence + '。')
        return ''.join(result)
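    # Worked example for the phrase-level pass below (hypothetical input,
    # for illustration only). With min_phrase_length=4, an input such as
    #   '水库泄洪请勿靠近，水库泄洪请勿靠近，水库泄洪请勿靠近，谢谢配合。'
    # contains the run '水库泄洪请勿靠近' three times, so its substrings of
    # 6+ characters occur >= 3 times. The longest such phrase is handled
    # first: its first occurrence is kept and the later two are removed,
    # leaving stray punctuation that the final normalization pass reduces.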
    def remove_phrase_duplicates(self, text: str, min_phrase_length: int = 4,
                                 max_phrase_length: int = 20) -> str:
        """Remove later occurrences of phrases repeated at least three times.

        Candidate phrases are character n-grams taken inside each
        punctuation-free run, so every candidate occurs verbatim in the
        text. (Joining several runs together, as an earlier version did,
        produces candidates that never match the text because the
        separators between runs are dropped.)
        """
        runs = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)
        phrases = []
        for run in runs:
            for n in range(min_phrase_length, min(max_phrase_length, len(run)) + 1):
                for i in range(len(run) - n + 1):
                    phrases.append(run[i:i + n])

        phrase_counts = Counter(phrases)
        # Only sufficiently long and frequent phrases are worth removing.
        frequent_phrases = [
            (phrase, count) for phrase, count in phrase_counts.items()
            if count >= 3 and len(phrase) >= 6
        ]

        cleaned_text = text
        # Handle longer phrases first so shorter ones do not split them up.
        for phrase, _ in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
            first_occurrence = cleaned_text.find(phrase)
            if first_occurrence == -1:
                continue
            head_end = first_occurrence + len(phrase)
            # Keep the first occurrence and drop all later ones.
            cleaned_text = cleaned_text[:head_end] + cleaned_text[head_end:].replace(phrase, '')
        return cleaned_text

    def comprehensive_deduplication(self, text: str) -> str:
        """Run the full pipeline: clean, then dedupe paragraphs, sentences, phrases."""
        # 1. Basic text cleaning
        text = self.clean_text(text)
        # 2. Paragraph-level deduplication
        text = self.remove_paragraph_duplicates(text, 0.85)
        # 3. Sentence-level deduplication
        text = self.remove_sentence_duplicates(text, 0.9)
        # 4. Phrase-level deduplication
        text = self.remove_phrase_duplicates(text, 4, 15)
        # 5. Final punctuation normalization
        for old, new in self.PUNCTUATION_MAP.items():
            text = text.replace(old, new)
        return text

    def process_single_broadcast(self, broadcast_data: Dict[str, Any]) -> Dict[str, Any]:
        """Deduplicate one broadcast record.

        The keys '广播ID' and '广播内容' are the field names used by the
        input data format.
        """
        broadcast_id = broadcast_data.get('广播ID', 'unknown')
        content = broadcast_data.get('广播内容', '')
        print(f"Processing broadcast ID: {broadcast_id}")

        if not content:
            return {
                'broadcast_id': broadcast_id,
                'original_content': content,
                'deduplicated_content': content,
                'processing_status': 'empty_content',
            }

        try:
            deduplicated_content = self.comprehensive_deduplication(content)
            return {
                'broadcast_id': broadcast_id,
                'original_content': content,
                'deduplicated_content': deduplicated_content,
                'processing_status': 'success',
            }
        except Exception as e:
            print(f"Error while processing broadcast ID {broadcast_id}: {e}")
            return {
                'broadcast_id': broadcast_id,
                'original_content': content,
                'deduplicated_content': content,
                'processing_status': 'error',
            }

    def process_broadcast_data(self, input_file: str = 'test.json',
                               output_file: str = 'deduplication_results.json'):
        """Read broadcasts from input_file, deduplicate each, and save simplified results."""
        try:
            print(f"Reading input file: {input_file}")
            data = self.safe_read_json(input_file)

            results = []
            if isinstance(data, dict):
                # A single broadcast object
                print("Detected a single broadcast record")
                results.append(self.process_single_broadcast(data))
            elif isinstance(data, list):
                # An array of broadcasts
                print(f"Detected a broadcast array with {len(data)} records")
                for i, broadcast in enumerate(data, 1):
                    print(f"Processing broadcast {i}/{len(data)}")
                    results.append(self.process_single_broadcast(broadcast))
            else:
                raise ValueError(
                    "Unsupported data format: provide a single broadcast object "
                    "or an array of broadcasts"
                )

            # Keep only the successful records in the simplified output.
            simplified_results = []
            successful_count = 0
            for result in results:
                if result['processing_status'] == 'success':
                    simplified_results.append({
                        'broadcast_id': result['broadcast_id'],
                        'original_content': result['original_content'],
                        'deduplicated_content': result['deduplicated_content'],
                    })
                    successful_count += 1

            print("\nProcessing finished!")
            print(f"Total processed: {len(results)}")
            print(f"Succeeded: {successful_count}")
            print(f"Failed: {len(results) - successful_count}")

            print(f"\nSaving simplified results to: {output_file}")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(simplified_results, f, ensure_ascii=False, indent=2)
            print("Done!")
            return simplified_results
        except Exception as e:
            print(f"Error during processing: {e}")
            raise
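# Quick usage sketch: running the pipeline on an in-memory string instead of
# a file. The sample text below is illustrative only (not from the project's
# test data); the thresholds are the defaults used by
# comprehensive_deduplication.
def example_inline_usage() -> None:
    dedup = BroadcastDeduplicator()
    sample = '今日有强降雨，请远离河道。今日有强降雨，请远离河道。请注意安全。'
    # The repeated sentence is collapsed by the paragraph/sentence passes,
    # so it appears only once in the printed output.
    print(dedup.comprehensive_deduplication(sample))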
def main():
    deduplicator = BroadcastDeduplicator()

    # Make sure the input file exists before running the pipeline.
    input_file = 'test.json'
    if not os.path.exists(input_file):
        print(f"Input file {input_file} does not exist!")
        print("Please create a test.json file containing broadcast data")
        print("\nSupported formats:")
        print("1. A single broadcast:")
        print('{"广播内容": "今天天气很好。今天天气很好。", "广播ID": "broadcast_001"}')
        print("\n2. An array of broadcasts:")
        print('[{"广播内容": "第一条...", "广播ID": "001"}, {"广播内容": "第二条...", "广播ID": "002"}]')
        return

    try:
        results = deduplicator.process_broadcast_data(input_file, 'deduplication_results.json')
        print("\nSimplified results saved to deduplication_results.json")
        print(f"Successfully processed {len(results)} broadcasts")
    except Exception as e:
        print(f"Program failed: {e}")


if __name__ == "__main__":
    main()