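"""Deduplication utility for Chinese broadcast text.

Reads broadcast records from a JSON file, removes duplicated paragraphs,
sentences, and phrases, and writes the cleaned results back to JSON.
"""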
import json
import re
import chardet
from difflib import SequenceMatcher
from collections import Counter
from typing import Union, List, Dict, Any
import os
class BroadcastDeduplicator:
"""广播去重处理类"""
def __init__(self):
pass
def detect_file_encoding(self, file_path: str) -> str:
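        """Detect the file encoding by running chardet over the first 10,000 bytes."""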
with open(file_path, 'rb') as f:
raw_data = f.read(10000)
result = chardet.detect(raw_data)
return result['encoding']
def safe_read_json(self, file_path: str) -> Union[Dict, List]:
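        """Read a JSON file, trying the detected encoding first and then a list of common fallbacks."""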
encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']
try:
detected_encoding = self.detect_file_encoding(file_path)
if detected_encoding:
encodings.insert(0, detected_encoding)
print(f"检测到文件编码: {detected_encoding}")
        except Exception:
            print("Encoding detection failed, falling back to the default encoding list")
for encoding in encodings:
try:
print(f"尝试使用编码 {encoding} 读取文件...")
with open(file_path, 'r', encoding=encoding) as f:
data = json.load(f)
print(f"成功使用编码 {encoding} 读取文件")
return data
except UnicodeDecodeError:
print(f"编码 {encoding} 失败")
continue
except json.JSONDecodeError as e:
print(f"JSON格式错误: {e}")
raise
except Exception as e:
print(f"使用编码 {encoding} 时出现其他错误: {e}")
continue
raise Exception(f"无法读取文件 {file_path}")
def clean_text(self, text: str) -> str:
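        """Normalize whitespace, strip HTML tags, remove characters outside the allowed Chinese/ASCII set, and collapse doubled punctuation."""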
if not isinstance(text, str):
return str(text)
text = re.sub(r'\r\n|\r|\n', ' ', text)
text = re.sub(r'\s+', ' ', text)
text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、,%.。!~?;:“”‘’()【】\-\s]', '', text)
        # Collapse doubled or mixed punctuation into a single mark
        punctuation_map = {
            ',,': ',',
            '。。': '。',
            ',。': '。',
            '。,': '。',
            '!!': '!',
            '??': '?',
            ';;': ';'
        }
for old, new in punctuation_map.items():
text = text.replace(old, new)
return text.strip()
def remove_paragraph_duplicates(self, text: str, similarity_threshold: float = 0.85) -> str:
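        """Drop paragraphs that are near-duplicates of one another, keeping the longer variant."""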
        paragraphs = text.split('\n')
paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p) > 0]
unique_paragraphs = []
for paragraph in paragraphs:
is_similar = False
for existing in unique_paragraphs:
similarity = SequenceMatcher(None, paragraph, existing).ratio()
if similarity > similarity_threshold:
is_similar = True
if len(paragraph) > len(existing):
unique_paragraphs[unique_paragraphs.index(existing)] = paragraph
break
if not is_similar:
unique_paragraphs.append(paragraph)
        return '\n'.join(unique_paragraphs)
def remove_sentence_duplicates(self, text: str, similarity_threshold: float = 0.9) -> str:
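        """Remove near-duplicate sentences and re-attach the terminal punctuation stripped by the split."""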
sentences = re.split(r'[。!?;]', text)
sentences = [s.strip() for s in sentences if s.strip() and len(s) > 0]
unique_sentences = []
for sentence in sentences:
is_duplicate = False
for existing in unique_sentences:
similarity = SequenceMatcher(None, sentence, existing).ratio()
if similarity > similarity_threshold:
is_duplicate = True
if len(sentence) > len(existing):
unique_sentences[unique_sentences.index(existing)] = sentence
break
if not is_duplicate:
unique_sentences.append(sentence)
result = []
for sentence in unique_sentences:
if sentence:
                # Restore the terminal punctuation that re.split removed
                if any(word in sentence for word in ['提醒', '注意', '防止']):
                    result.append(sentence + '!')
                elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
                    result.append(sentence + '?')
                elif any(word in sentence for word in ['重要', '紧急', '警告']):
                    result.append(sentence + '!')
                else:
                    result.append(sentence + '。')
return ''.join(result)
def remove_phrase_duplicates(self, text: str, min_phrase_length: int = 4, max_phrase_length: int = 20) -> str:
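        """Count repeated word n-grams and keep only the first occurrence of frequently repeated phrases."""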
words = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)
phrases = []
for n in range(min_phrase_length, min(max_phrase_length + 1, len(words) + 1)):
for i in range(len(words) - n + 1):
phrase = ''.join(words[i:i + n])
if len(phrase) >= min_phrase_length:
phrases.append(phrase)
phrase_counts = Counter(phrases)
frequent_phrases = [(phrase, count) for phrase, count in phrase_counts.items()
if count >= 3 and len(phrase) >= 6]
cleaned_text = text
for phrase, count in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
if phrase in cleaned_text:
first_occurrence = cleaned_text.find(phrase)
remaining_text = cleaned_text[first_occurrence + len(phrase):]
removed_count = remaining_text.count(phrase)
if removed_count > 0:
cleaned_text = cleaned_text[:first_occurrence + len(phrase)] + remaining_text.replace(phrase, '')
return cleaned_text
def comprehensive_deduplication(self, text: str) -> str:
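        """Run the full pipeline: clean the text, then deduplicate at paragraph, sentence, and phrase level."""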
        # 1. Clean the raw text
        text = self.clean_text(text)
        # 2. Paragraph-level deduplication
        text = self.remove_paragraph_duplicates(text, 0.85)
        # 3. Sentence-level deduplication
        text = self.remove_sentence_duplicates(text, 0.9)
        # 4. Phrase-level deduplication
        text = self.remove_phrase_duplicates(text, 4, 15)
        # 5. Final punctuation normalization
        punctuation_map = {
            ',,': ',',
            '。。': '。',
            ',。': '。',
            '。,': '。',
            '!!': '!',
            '??': '?',
            ';;': ';'
        }
for old, new in punctuation_map.items():
text = text.replace(old, new)
return text
def process_single_broadcast(self, broadcast_data: Dict[str, Any]) -> Dict[str, Any]:
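        """Deduplicate one broadcast record and return the result together with a processing status."""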
broadcast_id = broadcast_data.get('广播ID', 'unknown')
content = broadcast_data.get('广播内容', '')
print(f"处理广播ID: {broadcast_id}")
if not content:
return {
'broadcast_id': broadcast_id,
'original_content': content,
'deduplicated_content': content,
'processing_status': 'empty_content'
}
try:
deduplicated_content = self.comprehensive_deduplication(content)
return {
'broadcast_id': broadcast_id,
'original_content': content,
'deduplicated_content': deduplicated_content,
'processing_status': 'success'
}
except Exception as e:
print(f"处理广播ID {broadcast_id} 时出错: {str(e)}")
return {
'broadcast_id': broadcast_id,
'original_content': content,
'deduplicated_content': content,
'processing_status': 'error'
}
def process_broadcast_data(self, input_file: str = 'test.json', output_file: str = 'deduplication_results.json'):
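        """Read broadcasts from input_file, deduplicate each one, and save the successful results to output_file."""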
try:
            # Read the input file
            print(f"Reading input file: {input_file}")
data = self.safe_read_json(input_file)
results = []
            # Decide how to process based on the data type
            if isinstance(data, dict):
                # Single broadcast object
                print("Detected a single broadcast record")
result = self.process_single_broadcast(data)
results.append(result)
elif isinstance(data, list):
                # List of broadcasts
                print(f"Detected a broadcast list with {len(data)} entries")
for i, broadcast in enumerate(data, 1):
print(f"处理第 {i}/{len(data)} 条广播")
result = self.process_single_broadcast(broadcast)
results.append(result)
else:
raise ValueError("不支持的数据格式,请提供单条广播对象或广播数组")
simplified_results = []
successful_count = 0
for result in results:
if result['processing_status'] == 'success':
simplified_item = {
'broadcast_id': result['broadcast_id'],
'original_content': result['original_content'],
'deduplicated_content': result['deduplicated_content']
}
simplified_results.append(simplified_item)
successful_count += 1
            # Print processing statistics
            print("\nProcessing complete!")
            print(f"Total broadcasts processed: {len(results)}")
            print(f"Successfully processed: {successful_count}")
            print(f"Failed: {len(results) - successful_count}")
            # Save the simplified results
            print(f"\nSaving simplified results to: {output_file}")
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(simplified_results, f, ensure_ascii=False, indent=2)
print("处理完成!")
return simplified_results
except Exception as e:
print(f"处理过程中出现错误: {str(e)}")
raise
def main():
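    """Entry point: check that the input file exists, then run the deduplication pipeline."""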
deduplicator = BroadcastDeduplicator()
    # Check that the input file exists
input_file = 'test.json'
if not os.path.exists(input_file):
print(f"输入文件 {input_file} 不存在!")
print("请创建包含广播数据的 test.json 文件")
print("\n支持的格式示例:")
print("1. 单条广播:")
print('{"广播内容": "今天天气很好。今天天气很好。", "广播ID": "broadcast_001"}')
print("\n2. 广播数组:")
print('[{"广播内容": "第一条...", "广播ID": "001"}, {"广播内容": "第二条...", "广播ID": "002"}]')
return
try:
results = deduplicator.process_broadcast_data(input_file, 'deduplication_results.json')
print(f"\n简化结果已保存到 deduplication_results.json")
print(f"成功处理了 {len(results)} 条广播")
except Exception as e:
print(f"程序执行失败: {str(e)}")
if __name__ == "__main__":
main()