You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
310 lines
11 KiB
310 lines
11 KiB
import json |
|
import re |
|
import chardet |
|
from difflib import SequenceMatcher |
|
from collections import Counter |
|
from typing import Union, List, Dict, Any |
|
import os |
|
|
|
|
|
class BroadcastDeduplicator:
    """广播去重处理类 (broadcast deduplication).

    Cleans broadcast text and removes near-duplicate paragraphs, sentences
    and frequently repeated phrases, keeping the longest variant whenever
    two pieces of text are similar.
    """

    # Shared punctuation-normalisation table (doubled / mixed marks -> one mark).
    # Previously this map was duplicated inline in clean_text() and
    # comprehensive_deduplication(); hoisted here so both stay in sync.
    # NOTE(review): the 3rd and 4th keys render identically — if they are true
    # duplicates one is redundant (dict keeps the last); if they differ only in
    # half- vs full-width comma both are needed. Confirm against the original.
    _PUNCTUATION_MAP = {
        ',,': ',',
        '..': '。',
        ',。': '。',
        ',。': '。',
        '!!': '!',
        '??': '?',
        ';;': ';'
    }

    def __init__(self) -> None:
        # Stateless processor; nothing to initialise.
        pass

    def _normalize_punctuation(self, text: str) -> str:
        """Apply the shared punctuation replacements once, in table order."""
        for old, new in self._PUNCTUATION_MAP.items():
            text = text.replace(old, new)
        return text

    def detect_file_encoding(self, file_path: str) -> str:
        """Guess the encoding of *file_path* from its first 10 KB.

        Returns chardet's best guess; may be ``None`` when undetectable.
        """
        with open(file_path, 'rb') as f:
            raw_data = f.read(10000)  # a prefix is enough for detection
        result = chardet.detect(raw_data)
        return result['encoding']

    def safe_read_json(self, file_path: str) -> Union[Dict, List]:
        """Read a JSON file, trying a detected encoding then common fallbacks.

        Raises ``json.JSONDecodeError`` for malformed JSON (retrying other
        encodings cannot help there) and a generic ``Exception`` when no
        candidate encoding can decode the file at all.
        """
        encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']

        try:
            detected_encoding = self.detect_file_encoding(file_path)
            if detected_encoding:
                encodings.insert(0, detected_encoding)
                print(f"检测到文件编码: {detected_encoding}")
        except Exception:  # FIX: was a bare except (also swallowed SystemExit)
            print("编码检测失败,使用默认编码列表")

        # Drop duplicate candidates while preserving priority order, so a
        # detected encoding already in the fallback list is only tried once.
        encodings = list(dict.fromkeys(encodings))

        for encoding in encodings:
            try:
                print(f"尝试使用编码 {encoding} 读取文件...")
                with open(file_path, 'r', encoding=encoding) as f:
                    data = json.load(f)
                print(f"成功使用编码 {encoding} 读取文件")
                return data
            except UnicodeDecodeError:
                print(f"编码 {encoding} 失败")
                continue
            except json.JSONDecodeError as e:
                # Bytes decoded fine but the JSON itself is broken: surface it.
                print(f"JSON格式错误: {e}")
                raise
            except Exception as e:
                print(f"使用编码 {encoding} 时出现其他错误: {e}")
                continue

        raise Exception(f"无法读取文件 {file_path}")

    def clean_text(self, text: str) -> str:
        """Normalise whitespace, strip HTML-like tags and illegal characters.

        Non-string input is coerced with ``str()`` and returned as-is.
        """
        if not isinstance(text, str):
            return str(text)

        # Collapse line breaks, then any whitespace run, into single spaces.
        text = re.sub(r'\r\n|\r|\n', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        # Drop HTML-like tags.
        text = re.sub(r'<[^>]+>', '', text)

        # Keep only CJK, alphanumerics and an allow-list of punctuation.
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、,%.。!~?;:""''()【】\-\s]', '', text)

        text = self._normalize_punctuation(text)

        return text.strip()

    def remove_paragraph_duplicates(self, text: str, similarity_threshold: float = 0.85) -> str:
        """Remove near-duplicate '。'-separated paragraphs.

        When two paragraphs exceed *similarity_threshold* the longer one
        wins, replacing the shorter already-collected one in place.
        """
        paragraphs = [p.strip() for p in text.split('。') if p.strip()]

        unique_paragraphs: List[str] = []

        for paragraph in paragraphs:
            is_similar = False

            for idx, existing in enumerate(unique_paragraphs):
                similarity = SequenceMatcher(None, paragraph, existing).ratio()

                if similarity > similarity_threshold:
                    is_similar = True
                    if len(paragraph) > len(existing):
                        # FIX: replace by position; list.index(existing) could
                        # hit the wrong slot when two entries compare equal.
                        unique_paragraphs[idx] = paragraph
                    break

            if not is_similar:
                unique_paragraphs.append(paragraph)

        return '。'.join(unique_paragraphs)

    def remove_sentence_duplicates(self, text: str, similarity_threshold: float = 0.9) -> str:
        """Remove near-duplicate sentences and re-attach terminal punctuation.

        Sentences are split on 。!?;; the longer of two similar sentences
        wins. Terminal punctuation is then re-chosen by keyword heuristics.
        """
        sentences = [s.strip() for s in re.split(r'[。!?;]', text) if s.strip()]

        unique_sentences: List[str] = []

        for sentence in sentences:
            is_duplicate = False

            for idx, existing in enumerate(unique_sentences):
                similarity = SequenceMatcher(None, sentence, existing).ratio()

                if similarity > similarity_threshold:
                    is_duplicate = True
                    if len(sentence) > len(existing):
                        # FIX: positional replacement (see paragraph dedup).
                        unique_sentences[idx] = sentence
                    break

            if not is_duplicate:
                unique_sentences.append(sentence)

        # Re-attach punctuation: reminder wording -> 。, question markers -> ?,
        # emphasis keywords -> !, otherwise default to 。.
        result = []
        for sentence in unique_sentences:
            if sentence:
                if any(word in sentence for word in ['请', '提醒', '注意', '防止']):
                    result.append(sentence + '。')
                elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
                    result.append(sentence + '?')
                elif any(word in sentence for word in ['!', '重要', '紧急', '警告']):
                    result.append(sentence + '!')
                else:
                    result.append(sentence + '。')

        return ''.join(result)

    def remove_phrase_duplicates(self, text: str, min_phrase_length: int = 4, max_phrase_length: int = 20) -> str:
        """Collapse phrases (>= 6 chars, repeated >= 3 times) to one occurrence.

        Word n-grams between the two length bounds are counted; for each
        frequent phrase every occurrence after the first is deleted.
        """
        words = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)

        # Build candidate n-grams from consecutive words.
        phrases = []
        for n in range(min_phrase_length, min(max_phrase_length + 1, len(words) + 1)):
            for i in range(len(words) - n + 1):
                phrase = ''.join(words[i:i + n])
                if len(phrase) >= min_phrase_length:
                    phrases.append(phrase)

        phrase_counts = Counter(phrases)

        frequent_phrases = [(phrase, count) for phrase, count in phrase_counts.items()
                            if count >= 3 and len(phrase) >= 6]

        cleaned_text = text

        # Longest phrases first so shorter sub-phrases do not break them up.
        for phrase, _count in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
            if phrase in cleaned_text:
                first_occurrence = cleaned_text.find(phrase)
                remaining_text = cleaned_text[first_occurrence + len(phrase):]

                if remaining_text.count(phrase) > 0:
                    # Keep the first occurrence, delete all later ones.
                    cleaned_text = (cleaned_text[:first_occurrence + len(phrase)]
                                    + remaining_text.replace(phrase, ''))

        return cleaned_text

    def comprehensive_deduplication(self, text: str) -> str:
        """Full pipeline: clean, then paragraph/sentence/phrase dedup, then punctuation."""
        # 1. Text cleaning
        text = self.clean_text(text)

        # 2. Paragraph-level deduplication
        text = self.remove_paragraph_duplicates(text, 0.85)

        # 3. Sentence-level deduplication
        text = self.remove_sentence_duplicates(text, 0.9)

        # 4. Phrase-level deduplication
        text = self.remove_phrase_duplicates(text, 4, 15)

        # 5. Final punctuation normalisation (FIX: was a second inline copy of
        #    the punctuation map; now uses the shared class table).
        return self._normalize_punctuation(text)

    def process_single_broadcast(self, broadcast_data: Dict[str, Any]) -> Dict[str, Any]:
        """Deduplicate one broadcast record (keys '广播ID', '广播内容').

        Never raises: the outcome is reported via ``processing_status``
        ('success' | 'empty_content' | 'error'), falling back to the
        original content on error.
        """
        broadcast_id = broadcast_data.get('广播ID', 'unknown')
        content = broadcast_data.get('广播内容', '')

        print(f"处理广播ID: {broadcast_id}")

        if not content:
            return {
                'broadcast_id': broadcast_id,
                'original_content': content,
                'deduplicated_content': content,
                'processing_status': 'empty_content'
            }

        try:
            deduplicated_content = self.comprehensive_deduplication(content)

            return {
                'broadcast_id': broadcast_id,
                'original_content': content,
                'deduplicated_content': deduplicated_content,
                'processing_status': 'success'
            }
        except Exception as e:
            print(f"处理广播ID {broadcast_id} 时出错: {str(e)}")
            return {
                'broadcast_id': broadcast_id,
                'original_content': content,
                'deduplicated_content': content,
                'processing_status': 'error'
            }

    def process_broadcast_data(self, input_file: str = 'test.json',
                               output_file: str = 'deduplication_results.json') -> List[Dict[str, Any]]:
        """Process a JSON file of broadcasts and write successful results.

        Accepts either one broadcast object or a list of them. Only records
        with status 'success' are written to *output_file* (UTF-8 JSON).
        Re-raises any error after logging it.
        """
        try:
            # Read the input file.
            print(f"读取输入文件: {input_file}")
            data = self.safe_read_json(input_file)

            results = []

            # Dispatch on the top-level JSON shape.
            if isinstance(data, dict):
                # Single broadcast object.
                print("检测到单条广播数据")
                results.append(self.process_single_broadcast(data))
            elif isinstance(data, list):
                # Array of broadcasts.
                print(f"检测到广播数组,共 {len(data)} 条广播")
                for i, broadcast in enumerate(data, 1):
                    print(f"处理第 {i}/{len(data)} 条广播")
                    results.append(self.process_single_broadcast(broadcast))
            else:
                raise ValueError("不支持的数据格式,请提供单条广播对象或广播数组")

            # Keep only successful records, in a simplified shape.
            simplified_results = []
            successful_count = 0

            for result in results:
                if result['processing_status'] == 'success':
                    simplified_results.append({
                        'broadcast_id': result['broadcast_id'],
                        'original_content': result['original_content'],
                        'deduplicated_content': result['deduplicated_content']
                    })
                    successful_count += 1

            # Processing statistics.
            print(f"\n处理完成!")
            print(f"总计处理: {len(results)} 条广播")
            print(f"成功处理: {successful_count} 条")
            print(f"处理失败: {len(results) - successful_count} 条")

            # Persist the simplified results.
            print(f"\n保存简化结果到: {output_file}")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(simplified_results, f, ensure_ascii=False, indent=2)

            print("处理完成!")
            return simplified_results

        except Exception as e:
            print(f"处理过程中出现错误: {str(e)}")
            raise
|
|
|
|
|
def main():
    """Script entry point: deduplicate broadcasts read from test.json."""
    deduplicator = BroadcastDeduplicator()

    input_file = 'test.json'
    output_file = 'deduplication_results.json'

    # Bail out early with usage examples when the input file is missing.
    if not os.path.exists(input_file):
        usage_lines = (
            f"输入文件 {input_file} 不存在!",
            "请创建包含广播数据的 test.json 文件",
            "\n支持的格式示例:",
            "1. 单条广播:",
            '{"广播内容": "今天天气很好。今天天气很好。", "广播ID": "broadcast_001"}',
            "\n2. 广播数组:",
            '[{"广播内容": "第一条...", "广播ID": "001"}, {"广播内容": "第二条...", "广播ID": "002"}]',
        )
        for line in usage_lines:
            print(line)
        return

    try:
        processed = deduplicator.process_broadcast_data(input_file, output_file)
    except Exception as err:
        print(f"程序执行失败: {str(err)}")
    else:
        print(f"\n简化结果已保存到 deduplication_results.json")
        print(f"成功处理了 {len(processed)} 条广播")
|
|
|
|
|
if __name__ == "__main__": |
|
main() |