commit 6916d406e2
69 changed files with 2456628 additions and 0 deletions
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,239 @@
import requests
import json
import pandas as pd
import csv
from typing import List, Dict
import time

# %%
# 读取CSV文件
csv_file_path = "ai_ai_broadcast_info.csv"

# 尝试不同的编码格式
encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin-1', 'cp1252']

df = None

for encoding in encodings:
    try:
        print(f"尝试使用 {encoding} 编码读取文件...")
        df = pd.read_csv(csv_file_path, encoding=encoding)
        print(f"成功使用 {encoding} 编码读取CSV文件,共 {len(df)} 行数据")
        print(f"列名:{list(df.columns)}")
        break
    except UnicodeDecodeError as e:
        print(f" {encoding} 编码失败:{str(e)}")
        continue
    except Exception as e:
        print(f" {encoding} 编码读取时出现其他错误:{str(e)}")
        continue

if df is None:
    print("错误:尝试了所有编码格式都无法读取文件")
    exit()

# 检查是否有id和content列
if 'id' not in df.columns:
    print("错误:CSV文件中没有找到'id'列")
    print(f"可用列:{list(df.columns)}")
    exit()
elif 'content' not in df.columns:
    print("错误:CSV文件中没有找到'content'列")
    print(f"可用列:{list(df.columns)}")
    exit()

print(f"数据加载完成,可用的ID值:{sorted(df['id'].unique())}")


# %%
class SimpleOpenAIHubClient:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.openai-hub.com"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def chat(self, prompt, model="gpt-4.1"):
        """发送prompt并返回模型回答"""
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "max_tokens": 100000,
            "temperature": 0.7
        }

        try:
            response = requests.post(
                f"{self.base_url}/v1/chat/completions",
                headers=self.headers,
                json=payload,
                timeout=60
            )

            if response.status_code == 200:
                result = response.json()
                return result['choices'][0]['message']['content']
            else:
                return f"错误: {response.status_code} - {response.text}"
        except requests.exceptions.RequestException as e:
            return f"请求异常: {str(e)}"


print("AI客户端类定义完成!")

# %%
# 设置API Key
API_KEY = "sk-XREp2jnIXyZ6UoCnzZeO0ahmLi9OEXuVAtFLojKFpG9gCZ4e"  # 请替换为你的实际API Key

# 初始化AI客户端
client = SimpleOpenAIHubClient(API_KEY)

print("AI模型加载完成!")

# %%
prompt_template = """任务:文本合并
要求:
1. 提取JSON数组中所有"text"字段的内容
2. 按原始顺序直接拼接
3. 不修改任何文字
4. 不添加标点符号
5. 不做错误纠正
6. 只输出合并后的文本

输入数据:
<audio_text> %s </audio_text>

输出:"""

print("Prompt模板设置完成!")

# %%
# 批量处理ID 8-617的数据
target_ids = list(range(8, 618))  # 8到617(包含617)
results = []

print(f"开始批量处理ID {target_ids[0]} 到 {target_ids[-1]} 的数据...")
print(f"目标ID列表:{target_ids}")

# 统计可用的ID
available_ids = df['id'].unique()
missing_ids = [id for id in target_ids if id not in available_ids]
processable_ids = [id for id in target_ids if id in available_ids]

if missing_ids:
    print(f"警告:以下ID在数据中不存在:{missing_ids}")
    print(f"将要处理的ID:{processable_ids}")

# 循环处理每个ID
for current_id in target_ids:
    print(f"\n--- 处理ID {current_id} ---")

    # 查找对应ID的行
    target_row = df[df['id'] == current_id]

    if len(target_row) == 0:
        print(f"警告:没有找到id={current_id}的数据行")
        results.append({
            'id': current_id,
            'original_content': "",
            'merged_text': "",
            'status': "数据不存在"
        })
        continue

    # 获取content内容
    target_content = target_row['content'].iloc[0]
    content = str(target_content) if pd.notna(target_content) else ""

    print(f"Content内容长度:{len(content)}")
    print(f"Content预览:{content[:100]}..." if content else "Content为空")

    if not content or content == 'nan':
        print("Content为空,无法处理")
        results.append({
            'id': current_id,
            'original_content': content,
            'merged_text': "",
            'status': "内容为空"
        })
        continue

    # 构建prompt
    prompt = prompt_template % content

    print("正在调用AI模型处理...")

    # 调用AI模型
    try:
        ai_response = client.chat(prompt)

        results.append({
            'id': current_id,
            'original_content': content,
            'merged_text': ai_response,
            'status': "成功"
        })

        print("处理完成!")
        print(f"合并后文本预览:{ai_response[:100]}...")

        # 添加延时,避免API调用过于频繁
        time.sleep(1)

    except Exception as e:
        print(f"处理出错:{str(e)}")
        results.append({
            'id': current_id,
            'original_content': content,
            'merged_text': "",
            'status': f"处理出错: {str(e)}"
        })

print(f"\n=== 批量处理完成!===")
print(f"总共尝试处理:{len(target_ids)} 条记录")
print(f"实际处理完成:{len(results)} 条记录")

# %%
# 生成处理结果报告
result_df = pd.DataFrame(results)

# 显示处理结果统计
print("\n处理结果统计:")
status_counts = result_df['status'].value_counts()
print(status_counts)

# 显示各个状态的ID
for status in status_counts.index:
    status_ids = result_df[result_df['status'] == status]['id'].tolist()
    print(f"{status}的ID:{status_ids}")

# 显示结果预览
print("\n结果预览(前5行):")
preview_df = result_df[['id', 'status', 'merged_text']].copy()
preview_df['merged_text_preview'] = preview_df['merged_text'].apply(
    lambda x: str(x)[:100] + "..." if len(str(x)) > 100 else str(x)
)
print(preview_df[['id', 'status', 'merged_text_preview']].head())

# 保存到CSV文件
output_file = "merged_results_all.csv"
result_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\n结果已保存到文件:{output_file}")

# 成功处理的统计
successful_results = [r for r in results if r['status'] == '成功']
print(f"\n最终统计:")
print(f"成功处理:{len(successful_results)} 条记录")
print(f"失败处理:{len(results) - len(successful_results)} 条记录")

if successful_results:
    successful_ids = [r['id'] for r in successful_results]
    print(f"成功处理的ID:{successful_ids}")
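For context, a minimal standalone sketch (not part of the commit) of what the per-row prompt substitution above produces; the abbreviated template stand-in and the sample segments are invented, since the real CSV content is not shown here.

import json

template = "输入数据:\n<audio_text> %s </audio_text>\n输出:"  # abbreviated stand-in for prompt_template above
sample_content = json.dumps([{"text": "各位听众"}, {"text": "下面播送天气预报"}], ensure_ascii=False)
print(template % sample_content)
# In the notebook, the full prompt_template is used and the result is sent via client.chat(prompt),
# which requires a valid API key.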
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -0,0 +1,181 @@
import json
import re
import os
from datetime import datetime


def time_to_milliseconds(time_str):
    """将时间字符串转换为毫秒"""
    # 解析时间格式 HH:MM:SS
    parts = time_str.split(':')
    hours = int(parts[0])
    minutes = int(parts[1])
    seconds = int(parts[2])

    # 转换为毫秒
    total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000
    return total_ms


def parse_timeline_file(file_path, fixed_id=1104):
    """解析时间轴文本文件"""
    result = []

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read().strip()

        # 按行分割内容
        lines = content.split('\n')

        i = 0
        while i < len(lines):
            line = lines[i].strip()

            # 检查是否是时间轴格式:HH:MM:SS-HH:MM:SS
            time_match = re.match(r'(\d{2}:\d{2}:\d{2})-(\d{2}:\d{2}:\d{2})', line)

            if time_match:
                start_time_str = time_match.group(1)
                end_time_str = time_match.group(2)

                start_time_ms = time_to_milliseconds(start_time_str)
                end_time_ms = time_to_milliseconds(end_time_str)

                # 获取下一行作为内容(如果存在)
                content_text = ""
                if i + 1 < len(lines):
                    content_text = lines[i + 1].strip()
                    i += 1  # 跳过内容行

                # 创建JSON对象
                if content_text:  # 只有当内容不为空时才添加
                    json_obj = {
                        "d_id": fixed_id,
                        "start_time": start_time_ms,
                        "end_time": end_time_ms,
                        "content": content_text
                    }
                    result.append(json_obj)

            i += 1

    except FileNotFoundError:
        print(f"文件未找到: {file_path}")
        return []
    except Exception as e:
        print(f"处理文件时出错: {e}")
        return []

    return result


def get_txt_files(folder_path):
    """获取文件夹中所有的txt文件"""
    txt_files = []
    try:
        for filename in os.listdir(folder_path):
            if filename.lower().endswith('.txt'):
                full_path = os.path.join(folder_path, filename)
                txt_files.append((filename, full_path))

        # 按文件名排序,确保处理顺序一致
        txt_files.sort(key=lambda x: x[0])
        return txt_files
    except Exception as e:
        print(f"读取文件夹时出错: {e}")
        return []


def save_to_json(data, output_path):
    """保存为JSON文件"""
    try:
        with open(output_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=2)
        print(f"JSON文件已保存: {output_path}")
    except Exception as e:
        print(f"保存JSON文件时出错: {e}")


def batch_process_txt_files(folder_path, start_id=1104):
    """批量处理文件夹中的txt文件"""
    txt_files = get_txt_files(folder_path)

    if not txt_files:
        print("未找到任何txt文件")
        return

    print(f"找到 {len(txt_files)} 个txt文件:")
    for i, (filename, _) in enumerate(txt_files):
        print(f"{i + 1}. {filename} (d_id: {start_id + i})")

    all_data = []
    file_summary = []

    for i, (filename, file_path) in enumerate(txt_files):
        current_id = start_id + i
        print(f"\n正在处理: {filename} (d_id: {current_id})")

        # 解析单个文件
        file_data = parse_timeline_file(file_path, current_id)

        if file_data:
            all_data.extend(file_data)
            file_summary.append({
                "filename": filename,
                "d_id": current_id,
                "segments": len(file_data)
            })
            print(f"成功解析 {len(file_data)} 个数据段")
        else:
            print(f"文件 {filename} 未能解析到有效数据")

    # 保存合并的JSON文件
    if all_data:
        output_file = os.path.join(folder_path, "all_timeline_data.json")
        save_to_json(all_data, output_file)

        # 保存处理摘要
        summary_file = os.path.join(folder_path, "processing_summary.json")
        summary_data = {
            "total_files": len(txt_files),
            "total_segments": len(all_data),
            "start_id": start_id,
            "end_id": start_id + len(txt_files) - 1,
            "files": file_summary
        }
        save_to_json(summary_data, summary_file)

        print(f"\n=== 处理完成 ===")
        print(f"总文件数: {len(txt_files)}")
        print(f"总数据段: {len(all_data)}")
        print(f"ID范围: {start_id} - {start_id + len(txt_files) - 1}")
        print(f"合并文件: all_timeline_data.json")
        print(f"摘要文件: processing_summary.json")

        # # 分别保存每个文件的JSON
        # print(f"\n正在保存单独的JSON文件...")
        # for i, (filename, file_path) in enumerate(txt_files):
        #     current_id = start_id + i
        #     file_data = parse_timeline_file(file_path, current_id)
        #     if file_data:
        #         json_filename = filename.replace('.txt', '.json')
        #         json_path = os.path.join(folder_path, json_filename)
        #         save_to_json(file_data, json_path)
        #
        # print("所有单独的JSON文件已保存完成")

    else:
        print("没有解析到任何有效数据")


def main():
    # 批量处理文件夹中的txt文件
    folder_path = r"D:\workstation\voice-txt\ct-punc-test\ASR+punc\staic-应急宣传"
    start_id = 1104

    print("开始批量处理txt文件...")
    batch_process_txt_files(folder_path, start_id)


if __name__ == "__main__":
    main()
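A small worked example of the timeline format this parser expects; the two segments below are invented, but the millisecond arithmetic and the record shape match time_to_milliseconds() and parse_timeline_file() above.

import re

sample = """00:00:05-00:00:12
第一段内容
00:00:12-00:01:03
第二段内容"""

def to_ms(t):  # same arithmetic as time_to_milliseconds()
    h, m, s = map(int, t.split(':'))
    return (h * 3600 + m * 60 + s) * 1000

lines = [ln.strip() for ln in sample.split('\n')]
records = []
for i in range(0, len(lines), 2):
    m = re.match(r'(\d{2}:\d{2}:\d{2})-(\d{2}:\d{2}:\d{2})', lines[i])
    records.append({"d_id": 1104, "start_time": to_ms(m.group(1)),
                    "end_time": to_ms(m.group(2)), "content": lines[i + 1]})

print(records[1])  # {'d_id': 1104, 'start_time': 12000, 'end_time': 63000, 'content': '第二段内容'}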
File diff suppressed because one or more lines are too long
@@ -0,0 +1,206 @@
import json
import requests
import csv
from collections import defaultdict


class SimpleOpenAIHubClient:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.openai-hub.com"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def chat(self, prompt, model="gpt-4.1"):
        """发送prompt并返回模型回答"""
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "max_tokens": 100000,
            "temperature": 0.7
        }
        try:
            response = requests.post(
                f"{self.base_url}/v1/chat/completions",
                headers=self.headers,
                json=payload,
                timeout=60
            )
            if response.status_code == 200:
                result = response.json()
                return result['choices'][0]['message']['content']
            else:
                return f"错误: {response.status_code} - {response.text}"
        except requests.exceptions.RequestException as e:
            return f"请求异常: {str(e)}"


def load_json_data(file_path):
    """加载JSON数据"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        print(f"成功加载数据,共 {len(data)} 条记录")
        return data
    except Exception as e:
        print(f"加载JSON文件时出错: {e}")
        return []


def group_by_id(data):
    """按d_id分组数据"""
    grouped = defaultdict(list)
    for item in data:
        d_id = item.get('d_id')
        content = item.get('content', '')
        if d_id is not None and content:
            grouped[d_id].append({
                'start_time': item.get('start_time'),
                'end_time': item.get('end_time'),
                'content': content
            })

    # 对每个组内的数据按start_time排序
    for d_id in grouped:
        grouped[d_id].sort(key=lambda x: x['start_time'])

    return grouped


def create_prompt_for_group(group_data):
    """为每个组创建AI prompt"""
    # 构建类似JSON的文本数组格式
    text_array = []
    for i, item in enumerate(group_data):
        # 转义双引号
        escaped_content = item["content"].replace('"', '""')
        text_array.append(f'{{"text": "{escaped_content}"}}')

    json_like_text = "[" + ", ".join(text_array) + "]"

    prompt_template = """任务:文本合并
要求:
1. 提取JSON数组中所有"text"字段的内容
2. 按原始顺序直接拼接
3. 不修改任何文字
4. 不添加标点符号
5. 不做错误纠正
6. 只输出合并后的文本

输入数据:
<audio_text> %s </audio_text>

输出:"""

    return prompt_template % json_like_text


def merge_texts_directly(grouped_data):
    """直接按时间顺序合并文本,不使用AI模型"""
    merged_results = {}
    total_groups = len(grouped_data)

    print(f"开始处理 {total_groups} 个组...")

    for i, (d_id, group_data) in enumerate(grouped_data.items(), 1):
        print(f"处理组 {i}/{total_groups}: d_id={d_id}, 包含{len(group_data)}个片段")

        # 直接拼接所有文本内容
        merged_text = ""
        for item in group_data:
            merged_text += item['content']

        # 存储结果
        merged_results[d_id] = {
            'original_content': json.dumps(
                [{"end": item['end_time'], "start": item['start_time'], "text": item['content']} for item in
                 group_data], ensure_ascii=False),
            'merged_text': merged_text,
            'segments_count': len(group_data)
        }

        print(f"完成: d_id={d_id}")
        print(f"合并结果预览: {merged_text[:100]}...")
        print("-" * 50)

    return merged_results


def save_to_csv(merged_results, output_path):
    """保存为CSV文件"""
    try:
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['id', 'original_content', 'merged_text', 'status']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            # 写入表头
            writer.writeheader()

            # 按id排序并写入数据
            for d_id in sorted(merged_results.keys()):
                result = merged_results[d_id]
                writer.writerow({
                    'id': d_id,
                    'original_content': result['original_content'],
                    'merged_text': result['merged_text'],
                    'status': '成功'
                })

        print(f"CSV文件已保存到: {output_path}")
        return True
    except Exception as e:
        print(f"保存CSV文件时出错: {e}")
        return False


def main():
    # 输入文件路径
    input_file = r"D:\workstation\Data\广播\all_data619-1103.json"

    # 输出文件路径
    output_file = r"D:\workstation\Data\广播\merged_texts619-1103.csv"

    # 加载数据
    print("加载JSON数据...")
    data = load_json_data(input_file)

    if not data:
        print("无数据可处理")
        return

    # 按d_id分组
    print("按d_id分组数据...")
    grouped_data = group_by_id(data)
    print(f"共分为 {len(grouped_data)} 个组")

    # 显示分组统计
    print("\n分组统计:")
    for d_id, group_data in list(grouped_data.items())[:5]:  # 显示前5个组的信息
        print(f"d_id {d_id}: {len(group_data)} 个片段")
    if len(grouped_data) > 5:
        print(f"... 还有 {len(grouped_data) - 5} 个组")

    # 直接合并文本(不使用AI)
    print("\n开始直接文本合并...")
    merged_results = merge_texts_directly(grouped_data)

    # 保存结果
    print("\n保存合并结果...")
    if save_to_csv(merged_results, output_file):
        print("✅ 所有任务完成!")
        print(f"📁 输入文件: {input_file}")
        print(f"📄 输出文件: {output_file}")
        print(f"📊 处理统计: {len(grouped_data)} 个组,{len(data)} 个原始片段")
    else:
        print("❌ 保存结果失败")


if __name__ == "__main__":
    main()
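A toy illustration (invented data) of what group_by_id() followed by merge_texts_directly() produce: segments are grouped by d_id, ordered by start_time, and their content strings are concatenated in time order.

from collections import defaultdict

segments = [  # invented sample in the all_data*.json shape used above
    {"d_id": 619, "start_time": 2000, "end_time": 4000, "content": "天气预报。"},
    {"d_id": 619, "start_time": 0, "end_time": 2000, "content": "下面播送"},
    {"d_id": 620, "start_time": 0, "end_time": 1500, "content": "紧急通知。"},
]

grouped = defaultdict(list)
for item in segments:
    grouped[item["d_id"]].append(item)
for d_id in grouped:
    grouped[d_id].sort(key=lambda x: x["start_time"])

merged = {d_id: "".join(item["content"] for item in items) for d_id, items in grouped.items()}
print(merged)  # {619: '下面播送天气预报。', 620: '紧急通知。'}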
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,408 @@
import pandas as pd
import re
from difflib import SequenceMatcher
from collections import Counter
import chardet


def detect_file_encoding(file_path):
    """检测文件编码"""
    with open(file_path, 'rb') as f:
        raw_data = f.read(10000)  # 读取前10KB来检测编码
    result = chardet.detect(raw_data)
    return result['encoding']


def safe_read_csv(file_path):
    """安全读取CSV文件,自动检测编码"""
    # 尝试多种编码方式
    encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']

    # 首先尝试自动检测编码
    try:
        detected_encoding = detect_file_encoding(file_path)
        if detected_encoding:
            encodings.insert(0, detected_encoding)
            print(f"检测到文件编码: {detected_encoding}")
    except:
        print("编码检测失败,使用默认编码列表")

    # 尝试不同编码读取文件
    for encoding in encodings:
        try:
            print(f"尝试使用编码 {encoding} 读取文件...")
            df = pd.read_csv(file_path, encoding=encoding)
            print(f"成功使用编码 {encoding} 读取文件")
            return df
        except UnicodeDecodeError:
            print(f"编码 {encoding} 失败")
            continue
        except Exception as e:
            print(f"使用编码 {encoding} 时出现其他错误: {e}")
            continue

    # 如果所有编码都失败,尝试忽略错误的方式
    try:
        print("尝试使用 utf-8 编码并忽略错误...")
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='ignore')  # pandas 的参数名是 encoding_errors
        print("成功读取文件(忽略了一些字符)")
        return df
    except Exception as e:
        raise Exception(f"无法读取文件 {file_path}: {e}")


def clean_text(text):
    # 统一换行符和空格
    text = re.sub(r'\r\n|\r|\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # 多个空格合并为一个

    # 去除HTML标签(如果存在)
    text = re.sub(r'<[^>]+>', '', text)

    # 【修改点】保留中文、英文、数字、标点符号 (增加了顿号 `、`)
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、,%.。!?;:""''()【】\-\s]', '', text)

    # 标点符号规范化 (初次)
    punctuation_map = {
        ',,': ',',
        '..': '。',
        ',。': '。',
        ',。': '。',
        '!!': '!',
        '??': '?',
        ';;': ';'
    }

    for old, new in punctuation_map.items():
        text = text.replace(old, new)

    return text.strip()


def remove_paragraph_duplicates(text, similarity_threshold=0.85):
    """
    段落级别去重:基于相似度去除重复段落
    """
    paragraphs = text.split('。')
    paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p) > 0]

    unique_paragraphs = []
    removed_paragraphs = []

    for paragraph in paragraphs:
        is_similar = False

        for existing in unique_paragraphs:
            similarity = SequenceMatcher(None, paragraph, existing).ratio()

            if similarity > similarity_threshold:
                is_similar = True
                if len(paragraph) > len(existing):
                    removed_paragraphs.append(f"段落替换: {existing[:50]}...")
                    unique_paragraphs[unique_paragraphs.index(existing)] = paragraph
                else:
                    removed_paragraphs.append(f"段落重复: {paragraph[:50]}...")
                break

        if not is_similar:
            unique_paragraphs.append(paragraph)

    return '。'.join(unique_paragraphs), removed_paragraphs


def remove_sentence_duplicates(text, similarity_threshold=0.9):
    """
    句子级别去重:去除重复的句子
    """
    # 句子切分时,也可以考虑加入顿号,但这可能会切分得过细,这里暂时不修改
    sentences = re.split(r'[。!?;]', text)
    sentences = [s.strip() for s in sentences if s.strip() and len(s) > 0]

    unique_sentences = []
    removed_sentences = []

    for sentence in sentences:
        is_duplicate = False

        for existing in unique_sentences:
            similarity = SequenceMatcher(None, sentence, existing).ratio()

            if similarity > similarity_threshold:
                is_duplicate = True
                if len(sentence) > len(existing):
                    removed_sentences.append(f"句子替换: {existing[:30]}...")
                    unique_sentences[unique_sentences.index(existing)] = sentence
                else:
                    removed_sentences.append(f"句子重复: {sentence[:30]}...")
                break

        if not is_duplicate:
            unique_sentences.append(sentence)

    result = []
    for sentence in unique_sentences:
        if sentence:
            if any(word in sentence for word in ['请', '提醒', '注意', '防止']):
                result.append(sentence + '。')
            elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
                result.append(sentence + '?')
            elif any(word in sentence for word in ['!', '重要', '紧急', '警告']):
                result.append(sentence + '!')
            else:
                result.append(sentence + '。')

    return ''.join(result), removed_sentences


def remove_phrase_duplicates(text, min_phrase_length=4, max_phrase_length=20):
    """
    短语级别去重:去除重复的短语和词组
    """
    words = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)

    phrases = []
    for n in range(min_phrase_length, min(max_phrase_length + 1, len(words) + 1)):
        for i in range(len(words) - n + 1):
            phrase = ''.join(words[i:i + n])
            if len(phrase) >= min_phrase_length:
                phrases.append(phrase)

    phrase_counts = Counter(phrases)

    frequent_phrases = [(phrase, count) for phrase, count in phrase_counts.items()
                        if count >= 3 and len(phrase) >= 6]

    cleaned_text = text
    removed_phrases = []

    for phrase, count in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
        if phrase in cleaned_text:
            first_occurrence = cleaned_text.find(phrase)
            remaining_text = cleaned_text[first_occurrence + len(phrase):]

            removed_count = remaining_text.count(phrase)
            if removed_count > 0:
                cleaned_text = cleaned_text[:first_occurrence + len(phrase)] + remaining_text.replace(phrase, '')
                removed_phrases.append(f"短语重复({removed_count}次): {phrase}")

    return cleaned_text, removed_phrases


def comprehensive_deduplication(text):
    """
    综合去重:按层级顺序进行多级别去重,并在最后进行标点规范化
    """
    original_length = len(text)

    # 1. 段落级别去重
    print("1. 执行段落级别去重...")
    text, paragraph_removed = remove_paragraph_duplicates(text, 0.85)
    paragraph_length = len(text)
    print(f" 段落去重后长度: {paragraph_length} (减少 {original_length - paragraph_length} 字符)")

    # 2. 句子级别去重
    print("2. 执行句子级别去重...")
    text, sentence_removed = remove_sentence_duplicates(text, 0.9)
    sentence_length = len(text)
    print(f" 句子去重后长度: {sentence_length} (减少 {paragraph_length - sentence_length} 字符)")

    # 3. 短语级别去重
    print("3. 执行短语级别去重...")
    text, phrase_removed = remove_phrase_duplicates(text, 4, 15)
    phrase_length = len(text)
    print(f" 短语去重后长度: {phrase_length} (减少 {sentence_length - phrase_length} 字符)")

    # 4. 最终标点符号规范化
    print("4. 执行最终标点符号规范化...")
    punctuation_map = {
        ',,': ',',
        '..': '。',
        ',。': '。',
        ',。': '。',
        '!!': '!',
        '??': '?',
        ';;': ';'
    }

    final_text = text
    for old, new in punctuation_map.items():
        final_text = final_text.replace(old, new)

    final_length = len(final_text)
    print(f" 最终规范化后长度: {final_length} (减少 {phrase_length - final_length} 字符)")

    # 生成详细报告
    report = {
        'original_length': original_length,
        'after_paragraph': paragraph_length,
        'after_sentence': sentence_length,
        'after_phrase': phrase_length,
        'final_length': final_length,
        'total_reduction': original_length - final_length,
        'reduction_ratio': (original_length - final_length) / original_length if original_length > 0 else 0,
        'removed_items': {
            'paragraphs': paragraph_removed,
            'sentences': sentence_removed,
            'phrases': phrase_removed
        }
    }

    return final_text, report


# 主处理流程
def main():
    print("开始多级别去重处理...\n")

    # 读取CSV文件
    try:
        df = safe_read_csv('merged.csv')
    except Exception as e:
        print(f"读取CSV文件失败: {e}")
        return

    print(f"读取到CSV文件,共 {len(df)} 行数据")
    print(f"CSV文件列名: {list(df.columns)}")

    if 'id' in df.columns:
        print(f"可用的ID列表: {sorted(df['id'].unique())}")
    else:
        print("警告:CSV文件中没有找到'id'列")
        print("请检查CSV文件格式")
        return

    # 准备结果列表
    all_results = []
    all_reports = []

    # 遍历所有ID
    for current_id in sorted(df['id'].unique()):
        print(f"\n{'=' * 50}")
        print(f"处理ID: {current_id}")
        print(f"{'=' * 50}")

        target_row = df[df['id'] == current_id]

        if len(target_row) == 0:
            print(f"警告:没有找到ID={current_id}的数据")
            continue

        if 'merged_text' not in target_row.columns:
            print(f"错误:找不到merged_text列")
            continue

        original_text = target_row['merged_text'].iloc[0]

        if pd.isna(original_text) or str(original_text).strip() == '':
            print(f"警告:ID={current_id}的merged_text为空,跳过处理")
            all_results.append({
                'id': current_id, 'original_text': '', 'cleaned_text': '', 'final_processed_text': '',
                'original_length': 0, 'cleaned_length': 0, 'final_length': 0, 'paragraph_reduction': 0,
                'sentence_reduction': 0, 'phrase_reduction': 0, 'punctuation_reduction': 0,
                'total_reduction': 0, 'reduction_ratio': 0
            })
            continue

        print(f"原始文本长度: {len(original_text)} 字符")

        try:
            print("执行基础文本清洗...")
            cleaned_text = clean_text(str(original_text))
            print(f"清洗后文本长度: {len(cleaned_text)} 字符")

            final_text, dedup_report = comprehensive_deduplication(cleaned_text)

            print(f"处理完成")
            print(f"总体压缩比: {dedup_report['reduction_ratio']:.2%}")
            print(f"最终文本长度: {dedup_report['final_length']} 字符")

            result_record = {
                'id': current_id,
                'original_text': original_text,
                'cleaned_text': cleaned_text,
                'final_processed_text': final_text,
                'original_length': len(str(original_text)),
                'cleaned_length': len(cleaned_text),
                'final_length': len(final_text),
                'paragraph_reduction': dedup_report['original_length'] - dedup_report['after_paragraph'],
                'sentence_reduction': dedup_report['after_paragraph'] - dedup_report['after_sentence'],
                'phrase_reduction': dedup_report['after_sentence'] - dedup_report['after_phrase'],
                'punctuation_reduction': dedup_report['after_phrase'] - dedup_report['final_length'],
                'total_reduction': dedup_report['total_reduction'],
                'reduction_ratio': dedup_report['reduction_ratio']
            }

            all_results.append(result_record)
            all_reports.append((current_id, dedup_report))

        except Exception as e:
            print(f"处理ID={current_id}时出错: {str(e)}")
            all_results.append({
                'id': current_id, 'original_text': str(original_text), 'cleaned_text': '', 'final_processed_text': '',
                'original_length': len(str(original_text)), 'cleaned_length': 0, 'final_length': 0,
                'paragraph_reduction': 0, 'sentence_reduction': 0, 'phrase_reduction': 0, 'punctuation_reduction': 0,
                'total_reduction': 0, 'reduction_ratio': 0
            })

    print(f"\n{'=' * 60}")
    print("所有ID处理完成!")
    print(f"{'=' * 60}")

    result_df = pd.DataFrame(all_results)

    print(f"总共处理: {len(all_results)} 个ID")
    print(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID")
    print(f"处理失败或跳过: {len([r for r in all_results if r['final_length'] == 0])} 个ID")

    if len(all_results) > 0:
        avg_reduction = result_df['reduction_ratio'].mean()
        print(f"平均压缩比: {avg_reduction:.2%}")
        print(f"总原始字符数: {result_df['original_length'].sum()}")
        print(f"总最终字符数: {result_df['final_length'].sum()}")

    try:
        result_df.to_csv('batch_deduplication_results_619-1103_01.csv', index=False, encoding='utf-8-sig')
        print("结果已保存到: batch_deduplication_results_619-1103_01.csv")
    except Exception as e:
        print(f"保存结果CSV时出错: {e}")

    try:
        with open('batch_deduplication_report_619-1103_01.txt', 'w', encoding='utf-8') as f:
            f.write("=== 批量多级别去重详细报告 ===\n\n")
            f.write(f"处理日期: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"总共处理: {len(all_results)} 个ID\n")
            f.write(f"成功处理: {len([r for r in all_results if r['final_length'] > 0])} 个ID\n\n")

            if len(all_results) > 0:
                f.write("总体统计:\n")
                f.write(f"- 平均压缩比: {result_df['reduction_ratio'].mean():.2%}\n")
                f.write(f"- 总原始字符数: {result_df['original_length'].sum():,}\n")
                f.write(f"- 总最终字符数: {result_df['final_length'].sum():,}\n")
                f.write(f"- 总减少字符数: {result_df['total_reduction'].sum():,}\n\n")

            for id_num, report in all_reports:
                f.write(f"\n--- ID {id_num} 详细报告 ---\n")
                f.write(f"原始文本长度: {report['original_length']} 字符\n")
                f.write(f"最终文本长度: {report['final_length']} 字符\n")
                f.write(f"总体压缩比: {report['reduction_ratio']:.2%}\n")

                f.write("各级别处理效果:\n")
                f.write(f"1. 段落级去重: 减少 {report['original_length'] - report['after_paragraph']} 字符\n")
                f.write(f"2. 句子级去重: 减少 {report['after_paragraph'] - report['after_sentence']} 字符\n")
                f.write(f"3. 短语级去重: 减少 {report['after_sentence'] - report['after_phrase']} 字符\n")
                f.write(f"4. 最终标点规范化: 减少 {report['after_phrase'] - report['final_length']} 字符\n")

                for level, items in report['removed_items'].items():
                    if items:
                        f.write(f"{level.upper()}级别移除了 {len(items)} 项内容\n")

        print("详细报告已保存到: batch_deduplication_report_619-1103_01.txt")
    except Exception as e:
        print(f"保存报告时出错: {e}")

    print(f"\n结果预览:")
    print(result_df[['id', 'original_length', 'final_length', 'reduction_ratio']].head(10))


if __name__ == "__main__":
    main()
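To illustrate the similarity test the dedup passes above rely on, here is a standalone SequenceMatcher check on two near-duplicate sentences (the sample strings are invented); ratios above the 0.85 paragraph and 0.9 sentence thresholds are treated as duplicates, and the longer variant is kept.

from difflib import SequenceMatcher

a = "今日有暴雨请市民减少外出注意安全"
b = "今日有暴雨请广大市民减少外出注意安全"
ratio = SequenceMatcher(None, a, b).ratio()
print(f"{ratio:.2f}")  # ~0.94, above both thresholds, so the shorter string
                       # would be replaced by the longer one in the dedup loops.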
File diff suppressed because one or more lines are too long
@@ -0,0 +1,310 @@
import json
import re
import chardet
from difflib import SequenceMatcher
from collections import Counter
from typing import Union, List, Dict, Any
import os


class BroadcastDeduplicator:
    """广播去重处理类"""

    def __init__(self):
        pass

    def detect_file_encoding(self, file_path: str) -> str:
        with open(file_path, 'rb') as f:
            raw_data = f.read(10000)
        result = chardet.detect(raw_data)
        return result['encoding']

    def safe_read_json(self, file_path: str) -> Union[Dict, List]:
        encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']

        try:
            detected_encoding = self.detect_file_encoding(file_path)
            if detected_encoding:
                encodings.insert(0, detected_encoding)
                print(f"检测到文件编码: {detected_encoding}")
        except:
            print("编码检测失败,使用默认编码列表")

        for encoding in encodings:
            try:
                print(f"尝试使用编码 {encoding} 读取文件...")
                with open(file_path, 'r', encoding=encoding) as f:
                    data = json.load(f)
                print(f"成功使用编码 {encoding} 读取文件")
                return data
            except UnicodeDecodeError:
                print(f"编码 {encoding} 失败")
                continue
            except json.JSONDecodeError as e:
                print(f"JSON格式错误: {e}")
                raise
            except Exception as e:
                print(f"使用编码 {encoding} 时出现其他错误: {e}")
                continue

        raise Exception(f"无法读取文件 {file_path}")

    def clean_text(self, text: str) -> str:
        if not isinstance(text, str):
            return str(text)

        text = re.sub(r'\r\n|\r|\n', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        text = re.sub(r'<[^>]+>', '', text)

        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9、,%.。!~?;:""''()【】\-\s]', '', text)

        punctuation_map = {
            ',,': ',',
            '..': '。',
            ',。': '。',
            ',。': '。',
            '!!': '!',
            '??': '?',
            ';;': ';'
        }

        for old, new in punctuation_map.items():
            text = text.replace(old, new)

        return text.strip()

    def remove_paragraph_duplicates(self, text: str, similarity_threshold: float = 0.85) -> str:
        paragraphs = text.split('。')
        paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p) > 0]

        unique_paragraphs = []

        for paragraph in paragraphs:
            is_similar = False

            for existing in unique_paragraphs:
                similarity = SequenceMatcher(None, paragraph, existing).ratio()

                if similarity > similarity_threshold:
                    is_similar = True
                    if len(paragraph) > len(existing):
                        unique_paragraphs[unique_paragraphs.index(existing)] = paragraph
                    break

            if not is_similar:
                unique_paragraphs.append(paragraph)

        return '。'.join(unique_paragraphs)

    def remove_sentence_duplicates(self, text: str, similarity_threshold: float = 0.9) -> str:
        sentences = re.split(r'[。!?;]', text)
        sentences = [s.strip() for s in sentences if s.strip() and len(s) > 0]

        unique_sentences = []

        for sentence in sentences:
            is_duplicate = False

            for existing in unique_sentences:
                similarity = SequenceMatcher(None, sentence, existing).ratio()

                if similarity > similarity_threshold:
                    is_duplicate = True
                    if len(sentence) > len(existing):
                        unique_sentences[unique_sentences.index(existing)] = sentence
                    break

            if not is_duplicate:
                unique_sentences.append(sentence)

        result = []
        for sentence in unique_sentences:
            if sentence:
                if any(word in sentence for word in ['请', '提醒', '注意', '防止']):
                    result.append(sentence + '。')
                elif '?' in sentence or sentence.endswith('吗') or sentence.endswith('呢'):
                    result.append(sentence + '?')
                elif any(word in sentence for word in ['!', '重要', '紧急', '警告']):
                    result.append(sentence + '!')
                else:
                    result.append(sentence + '。')

        return ''.join(result)

    def remove_phrase_duplicates(self, text: str, min_phrase_length: int = 4, max_phrase_length: int = 20) -> str:
        words = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text)

        phrases = []
        for n in range(min_phrase_length, min(max_phrase_length + 1, len(words) + 1)):
            for i in range(len(words) - n + 1):
                phrase = ''.join(words[i:i + n])
                if len(phrase) >= min_phrase_length:
                    phrases.append(phrase)

        phrase_counts = Counter(phrases)

        frequent_phrases = [(phrase, count) for phrase, count in phrase_counts.items()
                            if count >= 3 and len(phrase) >= 6]

        cleaned_text = text

        for phrase, count in sorted(frequent_phrases, key=lambda x: len(x[0]), reverse=True):
            if phrase in cleaned_text:
                first_occurrence = cleaned_text.find(phrase)
                remaining_text = cleaned_text[first_occurrence + len(phrase):]

                removed_count = remaining_text.count(phrase)
                if removed_count > 0:
                    cleaned_text = cleaned_text[:first_occurrence + len(phrase)] + remaining_text.replace(phrase, '')

        return cleaned_text

    def comprehensive_deduplication(self, text: str) -> str:
        # 1. 文本清理
        text = self.clean_text(text)

        # 2. 段落级别去重
        text = self.remove_paragraph_duplicates(text, 0.85)

        # 3. 句子级别去重
        text = self.remove_sentence_duplicates(text, 0.9)

        # 4. 短语级别去重
        text = self.remove_phrase_duplicates(text, 4, 15)

        # 5. 最终标点符号规范化
        punctuation_map = {
            ',,': ',',
            '..': '。',
            ',。': '。',
            ',。': '。',
            '!!': '!',
            '??': '?',
            ';;': ';'
        }

        for old, new in punctuation_map.items():
            text = text.replace(old, new)

        return text

    def process_single_broadcast(self, broadcast_data: Dict[str, Any]) -> Dict[str, Any]:
        broadcast_id = broadcast_data.get('广播ID', 'unknown')
        content = broadcast_data.get('广播内容', '')

        print(f"处理广播ID: {broadcast_id}")

        if not content:
            return {
                'broadcast_id': broadcast_id,
                'original_content': content,
                'deduplicated_content': content,
                'processing_status': 'empty_content'
            }

        try:
            deduplicated_content = self.comprehensive_deduplication(content)

            return {
                'broadcast_id': broadcast_id,
                'original_content': content,
                'deduplicated_content': deduplicated_content,
                'processing_status': 'success'
            }

        except Exception as e:
            print(f"处理广播ID {broadcast_id} 时出错: {str(e)}")
            return {
                'broadcast_id': broadcast_id,
                'original_content': content,
                'deduplicated_content': content,
                'processing_status': 'error'
            }

    def process_broadcast_data(self, input_file: str = 'test.json', output_file: str = 'deduplication_results.json'):
        try:
            # 读取输入文件
            print(f"读取输入文件: {input_file}")
            data = self.safe_read_json(input_file)

            results = []

            # 判断数据类型并处理
            if isinstance(data, dict):
                # 单条广播
                print("检测到单条广播数据")
                result = self.process_single_broadcast(data)
                results.append(result)

            elif isinstance(data, list):
                # 广播数组
                print(f"检测到广播数组,共 {len(data)} 条广播")

                for i, broadcast in enumerate(data, 1):
                    print(f"处理第 {i}/{len(data)} 条广播")
                    result = self.process_single_broadcast(broadcast)
                    results.append(result)

            else:
                raise ValueError("不支持的数据格式,请提供单条广播对象或广播数组")

            simplified_results = []
            successful_count = 0

            for result in results:
                if result['processing_status'] == 'success':
                    simplified_item = {
                        'broadcast_id': result['broadcast_id'],
                        'original_content': result['original_content'],
                        'deduplicated_content': result['deduplicated_content']
                    }
                    simplified_results.append(simplified_item)
                    successful_count += 1

            # 输出处理统计
            print(f"\n处理完成!")
            print(f"总计处理: {len(results)} 条广播")
            print(f"成功处理: {successful_count} 条")
            print(f"处理失败: {len(results) - successful_count} 条")

            # 保存简化结果
            print(f"\n保存简化结果到: {output_file}")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(simplified_results, f, ensure_ascii=False, indent=2)

            print("处理完成!")
            return simplified_results

        except Exception as e:
            print(f"处理过程中出现错误: {str(e)}")
            raise


def main():
    deduplicator = BroadcastDeduplicator()

    # 检查输入文件是否存在
    input_file = 'test.json'
    if not os.path.exists(input_file):
        print(f"输入文件 {input_file} 不存在!")
        print("请创建包含广播数据的 test.json 文件")
        print("\n支持的格式示例:")
        print("1. 单条广播:")
        print('{"广播内容": "今天天气很好。今天天气很好。", "广播ID": "broadcast_001"}')
        print("\n2. 广播数组:")
        print('[{"广播内容": "第一条...", "广播ID": "001"}, {"广播内容": "第二条...", "广播ID": "002"}]')
        return

    try:
        results = deduplicator.process_broadcast_data(input_file, 'deduplication_results.json')
        print(f"\n简化结果已保存到 deduplication_results.json")
        print(f"成功处理了 {len(results)} 条广播")

    except Exception as e:
        print(f"程序执行失败: {str(e)}")


if __name__ == "__main__":
    main()
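A usage sketch (the saved file name and the sample payload are assumptions, not part of the commit) that writes the test.json this script looks for; running the script afterwards produces deduplication_results.json.

import json

# Tiny input in the format main() documents; the repeated sentence should
# collapse to a single copy in the deduplicated output.
sample = [{"广播ID": "broadcast_001", "广播内容": "今天天气很好。今天天气很好。"}]
with open("test.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False, indent=2)
# Then run the script above (e.g. `python broadcast_dedup.py`, if that is its file name)
# and inspect the 'deduplicated_content' field in deduplication_results.json.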
File diff suppressed because it is too large
@@ -0,0 +1,542 @@
=== 批量多级别去重详细报告 ===

处理日期: 2025-08-11 17:04:28
总共处理: 50 个ID
成功处理: 50 个ID

总体统计:
- 平均压缩比: 24.59%
- 总原始字符数: 108,025
- 总最终字符数: 57,951
- 总减少字符数: 50,038


--- ID 1104 详细报告 ---
原始文本长度: 791 字符
最终文本长度: 791 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1105 详细报告 ---
原始文本长度: 791 字符
最终文本长度: 791 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1106 详细报告 ---
原始文本长度: 7591 字符
最终文本长度: 801 字符
总体压缩比: 89.45%
各级别处理效果:
1. 段落级去重: 减少 6791 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 161 项内容

--- ID 1107 详细报告 ---
原始文本长度: 19 字符
最终文本长度: 19 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1108 详细报告 ---
原始文本长度: 3738 字符
最终文本长度: 1248 字符
总体压缩比: 66.61%
各级别处理效果:
1. 段落级去重: 减少 2491 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 60 项内容

--- ID 1109 详细报告 ---
原始文本长度: 4841 字符
最终文本长度: 4841 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1110 详细报告 ---
原始文本长度: 177 字符
最终文本长度: 104 字符
总体压缩比: 41.24%
各级别处理效果:
1. 段落级去重: 减少 74 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 1 项内容

--- ID 1111 详细报告 ---
原始文本长度: 212 字符
最终文本长度: 212 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1112 详细报告 ---
原始文本长度: 190 字符
最终文本长度: 116 字符
总体压缩比: 38.95%
各级别处理效果:
1. 段落级去重: 减少 75 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 1 项内容

--- ID 1113 详细报告 ---
原始文本长度: 1282 字符
最终文本长度: 1282 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1114 详细报告 ---
原始文本长度: 5262 字符
最终文本长度: 5262 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1115 详细报告 ---
原始文本长度: 5328 字符
最终文本长度: 2005 字符
总体压缩比: 62.37%
各级别处理效果:
1. 段落级去重: 减少 2707 字符
2. 句子级去重: 减少 616 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 33 项内容
SENTENCES级别移除了 7 项内容

--- ID 1116 详细报告 ---
原始文本长度: 5127 字符
最终文本长度: 5117 字符
总体压缩比: 0.20%
各级别处理效果:
1. 段落级去重: 减少 11 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 1 项内容

--- ID 1117 详细报告 ---
原始文本长度: 400 字符
最终文本长度: 400 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1118 详细报告 ---
原始文本长度: 1296 字符
最终文本长度: 817 字符
总体压缩比: 36.96%
各级别处理效果:
1. 段落级去重: 减少 480 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 11 项内容

--- ID 1119 详细报告 ---
原始文本长度: 445 字符
最终文本长度: 284 字符
总体压缩比: 36.18%
各级别处理效果:
1. 段落级去重: 减少 162 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 2 项内容

--- ID 1120 详细报告 ---
原始文本长度: 795 字符
最终文本长度: 422 字符
总体压缩比: 46.92%
各级别处理效果:
1. 段落级去重: 减少 374 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 10 项内容

--- ID 1121 详细报告 ---
原始文本长度: 796 字符
最终文本长度: 424 字符
总体压缩比: 46.73%
各级别处理效果:
1. 段落级去重: 减少 373 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 10 项内容

--- ID 1122 详细报告 ---
原始文本长度: 125 字符
最终文本长度: 125 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1123 详细报告 ---
原始文本长度: 37 字符
最终文本长度: 37 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1124 详细报告 ---
原始文本长度: 3675 字符
最终文本长度: 3175 字符
总体压缩比: 13.61%
各级别处理效果:
1. 段落级去重: 减少 501 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 14 项内容

--- ID 1125 详细报告 ---
原始文本长度: 498 字符
最终文本长度: 249 字符
总体压缩比: 50.00%
各级别处理效果:
1. 段落级去重: 减少 250 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 1 项内容

--- ID 1126 详细报告 ---
原始文本长度: 2461 字符
最终文本长度: 486 字符
总体压缩比: 80.25%
各级别处理效果:
1. 段落级去重: 减少 1976 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 40 项内容

--- ID 1127 详细报告 ---
原始文本长度: 2442 字符
最终文本长度: 1120 字符
总体压缩比: 54.14%
各级别处理效果:
1. 段落级去重: 减少 1323 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 15 项内容

--- ID 1128 详细报告 ---
原始文本长度: 2560 字符
最终文本长度: 1779 字符
总体压缩比: 30.51%
各级别处理效果:
1. 段落级去重: 减少 782 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 8 项内容

--- ID 1129 详细报告 ---
原始文本长度: 2561 字符
最终文本长度: 1788 字符
总体压缩比: 30.18%
各级别处理效果:
1. 段落级去重: 减少 774 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 7 项内容

--- ID 1130 详细报告 ---
原始文本长度: 673 字符
最终文本长度: 673 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1131 详细报告 ---
原始文本长度: 264 字符
最终文本长度: 264 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1132 详细报告 ---
原始文本长度: 1566 字符
最终文本长度: 1442 字符
总体压缩比: 7.92%
各级别处理效果:
1. 段落级去重: 减少 125 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 2 项内容

--- ID 1133 详细报告 ---
原始文本长度: 1559 字符
最终文本长度: 1559 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1134 详细报告 ---
原始文本长度: 2510 字符
最终文本长度: 356 字符
总体压缩比: 85.82%
各级别处理效果:
1. 段落级去重: 减少 2155 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 141 项内容

--- ID 1135 详细报告 ---
原始文本长度: 2530 字符
最终文本长度: 380 字符
总体压缩比: 84.98%
各级别处理效果:
1. 段落级去重: 减少 2151 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 158 项内容

--- ID 1136 详细报告 ---
原始文本长度: 251 字符
最终文本长度: 251 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1137 详细报告 ---
原始文本长度: 3153 字符
最终文本长度: 571 字符
总体压缩比: 81.89%
各级别处理效果:
1. 段落级去重: 减少 2583 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 147 项内容

--- ID 1138 详细报告 ---
原始文本长度: 917 字符
最终文本长度: 883 字符
总体压缩比: 3.71%
各级别处理效果:
1. 段落级去重: 减少 35 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 1 项内容

--- ID 1139 详细报告 ---
原始文本长度: 908 字符
最终文本长度: 857 字符
总体压缩比: 5.62%
各级别处理效果:
1. 段落级去重: 减少 52 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 1 项内容

--- ID 1140 详细报告 ---
原始文本长度: 2797 字符
最终文本长度: 1656 字符
总体压缩比: 40.79%
各级别处理效果:
1. 段落级去重: 减少 1142 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 25 项内容

--- ID 1141 详细报告 ---
原始文本长度: 800 字符
最终文本长度: 800 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1142 详细报告 ---
原始文本长度: 618 字符
最终文本长度: 598 字符
总体压缩比: 3.24%
各级别处理效果:
1. 段落级去重: 减少 21 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 1 项内容

--- ID 1143 详细报告 ---
原始文本长度: 1330 字符
最终文本长度: 732 字符
总体压缩比: 44.96%
各级别处理效果:
1. 段落级去重: 减少 599 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 11 项内容

--- ID 1144 详细报告 ---
原始文本长度: 22010 字符
最终文本长度: 1494 字符
总体压缩比: 93.21%
各级别处理效果:
1. 段落级去重: 减少 20517 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符
PARAGRAPHS级别移除了 342 项内容

--- ID 1145 详细报告 ---
原始文本长度: 42 字符
最终文本长度: 42 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1146 详细报告 ---
原始文本长度: 771 字符
最终文本长度: 771 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1147 详细报告 ---
原始文本长度: 1183 字符
最终文本长度: 1183 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1148 详细报告 ---
原始文本长度: 1184 字符
最终文本长度: 1184 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符

--- ID 1149 详细报告 ---
原始文本长度: 3964 字符
最终文本长度: 3964 字符
总体压缩比: 0.00%
各级别处理效果:
1. 段落级去重: 减少 1 字符
2. 句子级去重: 减少 -1 字符
3. 短语级去重: 减少 0 字符
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1150 详细报告 --- |
||||||
|
原始文本长度: 1263 字符 |
||||||
|
最终文本长度: 1191 字符 |
||||||
|
总体压缩比: 5.70% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 73 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 3 项内容 |
||||||
|
|
||||||
|
--- ID 1151 详细报告 --- |
||||||
|
原始文本长度: 1611 字符 |
||||||
|
最终文本长度: 1524 字符 |
||||||
|
总体压缩比: 5.40% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 88 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 8 项内容 |
||||||
|
|
||||||
|
--- ID 1152 详细报告 --- |
||||||
|
原始文本长度: 1810 字符 |
||||||
|
最终文本长度: 1046 字符 |
||||||
|
总体压缩比: 42.21% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 765 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 16 项内容 |
||||||
|
|
||||||
|
--- ID 1153 详细报告 --- |
||||||
|
原始文本长度: 835 字符 |
||||||
|
最终文本长度: 835 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
File diff suppressed because one or more lines are too long
@ -0,0 +1,155 @@ |
|||||||
|
import re |
||||||
|
import json |
||||||
|
import pandas as pd |
||||||
|
import os |
||||||
|
|
||||||
|
def split_sentences(text): |
||||||
|
# 使用捕获组来保留分隔符 |
||||||
|
parts = re.split(r'([。!?])', text) |
||||||
|
|
||||||
|
# 重新组合句子和标点符号 |
||||||
|
sentences = [] |
||||||
|
for i in range(0, len(parts), 2): |
||||||
|
if i < len(parts) and parts[i].strip(): |
||||||
|
# 如果有对应的标点符号,就加上 |
||||||
|
punctuation = parts[i + 1] if i + 1 < len(parts) else '' |
||||||
|
sentence = parts[i].strip() + punctuation |
||||||
|
sentences.append(sentence) |
||||||
|
|
||||||
|
return sentences |
||||||
|
def create_sentence_pairs(sentences): |
||||||
|
pairs = [] |
||||||
|
for i in range(len(sentences) - 1): |
||||||
|
pair = { |
||||||
|
"sentence1": sentences[i], |
||||||
|
"sentence2": sentences[i + 1], |
||||||
|
"label": -1 # 待标注 |
||||||
|
} |
||||||
|
pairs.append(pair) |
||||||
|
return pairs |
||||||
|
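# 补充示例(说明性注释,非原始代码,输入为假设): |
||||||
|
# split_sentences("今天晴。明天有雨!") 返回 ["今天晴。", "明天有雨!"], |
||||||
|
# create_sentence_pairs 对其生成 1 个句子对:{"sentence1": "今天晴。", "sentence2": "明天有雨!", "label": -1}。 |
||||||
|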
|
||||||
|
|
||||||
|
# 从CSV文件中读取所有内容 |
||||||
|
try: |
||||||
|
# 尝试不同的编码格式读取CSV文件 |
||||||
|
encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1'] |
||||||
|
df = None |
||||||
|
|
||||||
|
for encoding in encodings: |
||||||
|
try: |
||||||
|
print(f"尝试使用 {encoding} 编码读取文件...") |
||||||
|
df = pd.read_csv('batch_deduplication_results_619-1103_01.csv', encoding=encoding) |
||||||
|
print(f"成功使用 {encoding} 编码读取文件") |
||||||
|
break |
||||||
|
except UnicodeDecodeError: |
||||||
|
continue |
||||||
|
|
||||||
|
if df is None: |
||||||
|
print("错误:尝试了所有常见编码都无法读取文件") |
||||||
|
exit() |
||||||
|
|
||||||
|
except FileNotFoundError: |
||||||
|
print("错误:找不到文件 'batch_deduplication_results_619-1103_01.csv'") |
||||||
|
exit() |
||||||
|
except Exception as e: |
||||||
|
print(f"读取CSV文件时发生错误:{e}") |
||||||
|
exit() |
||||||
|
|
||||||
|
# 创建输出目录 |
||||||
|
output_dir = 'sentence_pairs_output_all' |
||||||
|
if not os.path.exists(output_dir): |
||||||
|
os.makedirs(output_dir) |
||||||
|
|
||||||
|
# 汇总所有数据 |
||||||
|
all_sentence_pairs = [] |
||||||
|
summary_info = [] |
||||||
|
|
||||||
|
print(f"CSV文件共有 {len(df)} 行数据") |
||||||
|
print("开始遍历所有ID...") |
||||||
|
|
||||||
|
# 遍历所有行 |
||||||
|
for index, row in df.iterrows(): |
||||||
|
# 先取ID(列缺失时退回行号),避免下方except分支引用未定义的current_id |
||||||
|
current_id = row.get('id', index) |
||||||
|
try: |
||||||
|
raw_text = row['final_processed_text'] |
||||||
|
|
||||||
|
# 检查文本是否为空 |
||||||
|
if pd.isna(raw_text) or str(raw_text).strip() == '': |
||||||
|
print(f"ID {current_id}: 文本内容为空,跳过") |
||||||
|
summary_info.append({ |
||||||
|
'id': current_id, |
||||||
|
'status': '文本为空', |
||||||
|
'sentences_count': 0, |
||||||
|
'pairs_count': 0 |
||||||
|
}) |
||||||
|
continue |
||||||
|
|
||||||
|
# 执行分割和配对 |
||||||
|
sentences = split_sentences(str(raw_text)) |
||||||
|
sentence_pairs = create_sentence_pairs(sentences) |
||||||
|
|
||||||
|
# 为每个句子对添加来源ID |
||||||
|
for pair in sentence_pairs: |
||||||
|
pair['source_id'] = current_id |
||||||
|
|
||||||
|
# 添加到汇总数据 |
||||||
|
all_sentence_pairs.extend(sentence_pairs) |
||||||
|
|
||||||
|
# 为每个ID单独保存文件 |
||||||
|
if sentence_pairs: # 只有当有句子对时才保存 |
||||||
|
filename = f'sentence_pairs_id_{current_id}.json' |
||||||
|
filepath = os.path.join(output_dir, filename) |
||||||
|
with open(filepath, 'w', encoding='utf-8') as f: |
||||||
|
json.dump(sentence_pairs, f, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
# 记录处理信息 |
||||||
|
summary_info.append({ |
||||||
|
'id': current_id, |
||||||
|
'status': '成功处理', |
||||||
|
'sentences_count': len(sentences), |
||||||
|
'pairs_count': len(sentence_pairs), |
||||||
|
'text_length': len(str(raw_text)) |
||||||
|
}) |
||||||
|
|
||||||
|
print(f"ID {current_id}: 分割出 {len(sentences)} 个句子,生成 {len(sentence_pairs)} 个句子对") |
||||||
|
|
||||||
|
except Exception as e: |
||||||
|
print(f"处理ID {current_id} 时发生错误:{e}") |
||||||
|
summary_info.append({ |
||||||
|
'id': current_id, |
||||||
|
'status': f'错误: {str(e)}', |
||||||
|
'sentences_count': 0, |
||||||
|
'pairs_count': 0 |
||||||
|
}) |
||||||
|
|
||||||
|
# 保存汇总的所有句子对数据 |
||||||
|
print("\n保存汇总数据...") |
||||||
|
with open('all_sentence_pairs_for_annotation.json', 'w', encoding='utf-8') as f: |
||||||
|
json.dump(all_sentence_pairs, f, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
# 保存处理摘要 |
||||||
|
summary_df = pd.DataFrame(summary_info) |
||||||
|
summary_df.to_csv('processing_summary.csv', index=False, encoding='utf-8-sig') |
||||||
|
|
||||||
|
# 统计信息 |
||||||
|
total_sentences = sum([info['sentences_count'] for info in summary_info]) |
||||||
|
total_pairs = sum([info['pairs_count'] for info in summary_info]) |
||||||
|
successful_ids = len([info for info in summary_info if info['status'] == '成功处理']) |
||||||
|
|
||||||
|
print(f"\n=== 处理完成 ===") |
||||||
|
print(f"总计处理了 {len(df)} 个ID") |
||||||
|
print(f"成功处理 {successful_ids} 个ID") |
||||||
|
print(f"总计分割出 {total_sentences} 个句子") |
||||||
|
print(f"总计生成 {total_pairs} 个句子对") |
||||||
|
print(f"汇总数据保存到: all_sentence_pairs_for_annotation.json") |
||||||
|
print(f"单独文件保存在: {output_dir}/ 目录") |
||||||
|
print(f"处理摘要保存到: processing_summary.csv") |
||||||
|
|
||||||
|
# 显示前几个句子对的示例 |
||||||
|
if all_sentence_pairs: |
||||||
|
print("\n前3个句子对示例:") |
||||||
|
for i in range(min(3, len(all_sentence_pairs))): |
||||||
|
print(f"\n第{i + 1}对 (来源ID: {all_sentence_pairs[i]['source_id']}):") |
||||||
|
print(f"句子1: {all_sentence_pairs[i]['sentence1']}") |
||||||
|
print(f"句子2: {all_sentence_pairs[i]['sentence2']}") |
||||||
|
print(f"标签: {all_sentence_pairs[i]['label']}") |
||||||
@ -0,0 +1,514 @@ |
|||||||
|
import requests |
||||||
|
import json |
||||||
|
import pandas as pd |
||||||
|
from typing import List, Dict |
||||||
|
import time |
||||||
|
|
||||||
|
|
||||||
|
class SimpleOpenAIHubClient: |
||||||
|
def __init__(self, api_key): |
||||||
|
self.api_key = api_key |
||||||
|
self.base_url = "https://api.openai-hub.com" |
||||||
|
self.headers = { |
||||||
|
"Authorization": f"Bearer {api_key}", |
||||||
|
"Content-Type": "application/json" |
||||||
|
} |
||||||
|
|
||||||
|
def chat(self, prompt, model="gpt-4.1"): |
||||||
|
"""发送prompt并返回模型回答""" |
||||||
|
payload = { |
||||||
|
"model": model, |
||||||
|
"messages": [ |
||||||
|
{ |
||||||
|
"role": "user", |
||||||
|
"content": prompt |
||||||
|
} |
||||||
|
], |
||||||
|
"max_tokens": 32768, |
||||||
|
"temperature": 0.7 |
||||||
|
} |
||||||
|
|
||||||
|
try: |
||||||
|
response = requests.post( |
||||||
|
f"{self.base_url}/v1/chat/completions", |
||||||
|
headers=self.headers, |
||||||
|
json=payload, |
||||||
|
timeout=60 |
||||||
|
) |
||||||
|
|
||||||
|
if response.status_code == 200: |
||||||
|
result = response.json() |
||||||
|
return result['choices'][0]['message']['content'] |
||||||
|
else: |
||||||
|
return f"错误: {response.status_code} - {response.text}" |
||||||
|
except requests.exceptions.RequestException as e: |
||||||
|
return f"请求异常: {str(e)}" |
||||||
|
|
||||||
|
|
||||||
|
print("AI客户端类定义完成!") |
||||||
|
|
||||||
|
# 设置API Key |
||||||
|
API_KEY = "sk-XREp2jnIXyZ6UoCnzZeO0ahmLi9OEXuVAtFLojKFpG9gCZ4e" # 请替换为你的实际API Key |
||||||
|
|
||||||
|
# 初始化AI客户端 |
||||||
|
client = SimpleOpenAIHubClient(API_KEY) |
||||||
|
print("AI模型加载完成!") |
||||||
|
|
||||||
|
# 定义批量标注的Prompt模板 |
||||||
|
BATCH_SEGMENTATION_PROMPT = """你是一个专业的广播内容段落分割标注员。你的任务是批量判断多个相邻句子对之间是否应该进行段落分割,以便广播员更好地掌握停顿和语调变化。 |
||||||
|
|
||||||
|
**完整文本内容上下文:** |
||||||
|
{context_text} |
||||||
|
|
||||||
|
**标注规则:** |
||||||
|
- 标签0:两个句子属于同一段落,连续播报,轻微停顿 |
||||||
|
- 标签1:两个句子属于不同段落,需要明显停顿或语调转换 |
||||||
|
|
||||||
|
**重要标注要求(请严格遵循):** |
||||||
|
- 如果整个文本内容都在讲同一个事,你有理由只输出一段,不是追求分的段越多越细就越好 |
||||||
|
- 每个分段必须保持原始语句的绝对顺序 |
||||||
|
- 最终分段数可能等于或小于原始语句数量 |
||||||
|
- 必须保留所有原始语句文本,不得遗漏任何内容 |
||||||
|
- 应客户强烈要求,他们需要的是较粗的分段,不要太细,如同一条通告,不需要分段成具体的每个条款之类的,只需要将整个相同的通告分成一段 |
||||||
|
- 优先考虑较粗的分段,避免过度细分 |
||||||
|
|
||||||
|
**广播分段判断依据(偏向粗分段):** |
||||||
|
1. **重大主题转换**:从一个完全不同的话题转向另一个话题(如从天气预报转向安全通知) |
||||||
|
2. **文档类型变化**:从一个完整文档转向另一个完整文档(如从禁火令转向倡议书) |
||||||
|
3. **内容性质变化**:从通知类内容转向完全不同性质的内容(如从法规转向天气预报) |
||||||
|
4. **广播节目段落**:明显的广播节目结构变化(如开场白结束进入正式内容) |
||||||
|
5. **分点阐述结构**:标题和所有分点条目内容应该合并为一个完整段落(如"森林防火十不准,一不乱扔烟头,二不随意丢弃火种,三不在林区吸烟"等整体合成一段) |
||||||
|
|
||||||
|
**广播内容特别注意(粗分段原则):** |
||||||
|
- 整个通告、法令、倡议书等应作为一个段落,不要拆分条款 |
||||||
|
- 同一主题的多个条款应保持在同一段落 |
||||||
|
- 只有在完全不同的文档或重大主题转换时才分段 |
||||||
|
- 广播开场白可以独立成段,但具体内容尽量合并 |
||||||
|
- 同一类型的预报信息(如天气预报的不同地区)应保持在同一段 |
||||||
|
- **分点阐述内容的特殊处理**: |
||||||
|
- 标题性内容(如"森林防火十不准")与分点条目内容之间不需要分段 |
||||||
|
- 标题和所有的分点条目(如"一不乱扔烟头"、"二不随意丢弃火种"、"三不在林区吸烟"等)应该合并为一个完整段落 |
||||||
|
- 分点条目之间不需要分段,应该连续播报 |
||||||
|
- 整个分点阐述结构作为一个完整的内容单元,保持连贯性 |
||||||
|
|
||||||
|
**批量标注说明:** |
||||||
|
- 每个句子对都有一个source_id,表示来源文档 |
||||||
|
- 请保持原有的source_id不变 |
||||||
|
- 将label从-1改为实际的标注结果(0或1) |
||||||
|
- 为每个句子对提供简要的分段理由 |
||||||
|
- 结合上述完整文本内容理解句子对的上下文语境 |
||||||
|
- **特别重要:倾向于标注更多的0(同段落),减少1(分段)的使用,分点阐述结构应保持为一个完整段落** |
||||||
|
|
||||||
|
现在请对以下句子对进行批量标注: |
||||||
|
|
||||||
|
{batch_sentence_pairs} |
||||||
|
|
||||||
|
请直接输出标注结果,格式如下: |
||||||
|
```json |
||||||
|
[ |
||||||
|
{{ |
||||||
|
"sentence1": "...", |
||||||
|
"sentence2": "...", |
||||||
|
"label": 0或1, |
||||||
|
"reason": "广播分段理由", |
||||||
|
"source_id": 原有的source_id |
||||||
|
}} |
||||||
|
] |
||||||
|
``` |
||||||
|
|
||||||
|
只输出JSON数据,不要其他说明文字。""" |
||||||
|
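# 补充说明(新增注释):模板中的 {{ 与 }} 经 str.format 渲染后成为字面量 { },因此 |
||||||
|
# 只有 {context_text} 与 {batch_sentence_pairs} 两个占位符会被实际替换。 |
||||||
|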
|
||||||
|
|
||||||
|
def load_context_data(csv_file="batch_deduplication_results_619-1103_01.csv"): |
||||||
|
""" |
||||||
|
从batch_deduplication_results_619-1103_01.csv加载上下文数据 |
||||||
|
|
||||||
|
Args: |
||||||
|
csv_file: CSV文件路径 |
||||||
|
|
||||||
|
Returns: |
||||||
|
字典,key为id,value为final_processed_text |
||||||
|
""" |
||||||
|
try: |
||||||
|
print(f"正在读取上下文数据文件: {csv_file}") |
||||||
|
|
||||||
|
# 尝试不同的编码格式 |
||||||
|
encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin-1', 'cp1252'] |
||||||
|
context_df = None |
||||||
|
|
||||||
|
for encoding in encodings: |
||||||
|
try: |
||||||
|
print(f" 尝试使用 {encoding} 编码...") |
||||||
|
context_df = pd.read_csv(csv_file, encoding=encoding) |
||||||
|
print(f" ✓ 成功使用 {encoding} 编码读取文件") |
||||||
|
break |
||||||
|
except UnicodeDecodeError: |
||||||
|
print(f" {encoding} 编码失败") |
||||||
|
continue |
||||||
|
except Exception as e: |
||||||
|
print(f" {encoding} 编码读取时出现其他错误:{str(e)}") |
||||||
|
continue |
||||||
|
|
||||||
|
if context_df is None: |
||||||
|
print(f"✗ 错误:尝试了所有编码格式都无法读取文件 {csv_file}") |
||||||
|
return {} |
||||||
|
|
||||||
|
if 'id' not in context_df.columns or 'final_processed_text' not in context_df.columns: |
||||||
|
print(f"✗ 错误:CSV文件缺少必需列") |
||||||
|
print(f" 需要的列: ['id', 'final_processed_text']") |
||||||
|
print(f" 实际的列: {list(context_df.columns)}") |
||||||
|
return {} |
||||||
|
|
||||||
|
# 创建id到final_processed_text的映射 |
||||||
|
context_dict = {} |
||||||
|
for _, row in context_df.iterrows(): |
||||||
|
context_dict[row['id']] = row['final_processed_text'] if pd.notna(row['final_processed_text']) else "" |
||||||
|
|
||||||
|
print(f"✓ 成功加载上下文数据") |
||||||
|
print(f" - 可用ID数量: {len(context_dict)}") |
||||||
|
print(f" - 可用ID列表: {sorted(context_dict.keys())}") |
||||||
|
|
||||||
|
return context_dict |
||||||
|
|
||||||
|
except FileNotFoundError: |
||||||
|
print(f"✗ 警告:找不到上下文文件 {csv_file}") |
||||||
|
print(" 将在没有上下文的情况下进行标注") |
||||||
|
return {} |
||||||
|
except Exception as e: |
||||||
|
print(f"✗ 读取上下文文件时出错: {str(e)}") |
||||||
|
print(" 将在没有上下文的情况下进行标注") |
||||||
|
return {} |
||||||
|
|
||||||
|
|
||||||
|
def process_batch_segmentation(sentence_pairs_data, context_dict, batch_size=10): |
||||||
|
""" |
||||||
|
批量处理句子对的段落分割标注 |
||||||
|
|
||||||
|
Args: |
||||||
|
sentence_pairs_data: 句子对数据列表 |
||||||
|
context_dict: 上下文数据字典 |
||||||
|
batch_size: 每批处理的数量 |
||||||
|
|
||||||
|
Returns: |
||||||
|
处理结果列表 |
||||||
|
""" |
||||||
|
all_results = [] |
||||||
|
total_pairs = len(sentence_pairs_data) |
||||||
|
|
||||||
|
print(f"开始批量标注,总共 {total_pairs} 个句子对") |
||||||
|
print(f"每批处理 {batch_size} 个句子对") |
||||||
|
|
||||||
|
# 分批处理 |
||||||
|
for i in range(0, total_pairs, batch_size): |
||||||
|
batch_end = min(i + batch_size, total_pairs) |
||||||
|
current_batch = sentence_pairs_data[i:batch_end] |
||||||
|
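# 说明性示例(补充注释):total_pairs=20、batch_size=8 时,三个批次分别覆盖句子对 1-8、9-16、17-20。 |
||||||
|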
|
||||||
|
print(f"\n处理第 {i // batch_size + 1} 批 (句子对 {i + 1}-{batch_end})") |
||||||
|
|
||||||
|
try: |
||||||
|
# 获取当前批次涉及的source_id的上下文 |
||||||
|
source_ids_in_batch = set(pair['source_id'] for pair in current_batch) |
||||||
|
context_text = "" |
||||||
|
|
||||||
|
for source_id in sorted(source_ids_in_batch): |
||||||
|
if source_id in context_dict and context_dict[source_id]: |
||||||
|
context_text += f"\n--- Source ID {source_id} 完整文本内容 ---\n" |
||||||
|
context_text += context_dict[source_id] # 完整内容,不截断 |
||||||
|
context_text += "\n" |
||||||
|
else: |
||||||
|
context_text += f"\n--- Source ID {source_id} ---\n(未找到对应的完整文本内容)\n" |
||||||
|
|
||||||
|
# 准备当前批次的数据 |
||||||
|
batch_json = json.dumps(current_batch, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
# 构建prompt |
||||||
|
prompt = BATCH_SEGMENTATION_PROMPT.format( |
||||||
|
context_text=context_text, |
||||||
|
batch_sentence_pairs=batch_json |
||||||
|
) |
||||||
|
|
||||||
|
print(f"发送请求到AI模型...") |
||||||
|
print(f" - 涉及source_id: {sorted(source_ids_in_batch)}") |
||||||
|
print(f" - 上下文长度: {len(context_text)} 字符") |
||||||
|
|
||||||
|
# 调用AI模型 |
||||||
|
ai_response = client.chat(prompt) |
||||||
|
|
||||||
|
print(f"收到模型响应") |
||||||
|
|
||||||
|
# 尝试解析JSON响应 |
||||||
|
try: |
||||||
|
# 提取JSON部分(去除可能的markdown格式) |
||||||
|
json_start = ai_response.find('[') |
||||||
|
json_end = ai_response.rfind(']') + 1 |
||||||
|
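# 补充说明:若响应被 ```json ... ``` 包裹,find('[')/rfind(']') 可跳过围栏只截取JSON数组; |
||||||
|
# 未找到时 find 返回 -1、rfind(']')+1 为 0,对应下方的有效性判断。 |
||||||
|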
|
||||||
|
if json_start != -1 and json_end != 0: |
||||||
|
json_content = ai_response[json_start:json_end] |
||||||
|
batch_results = json.loads(json_content) |
||||||
|
|
||||||
|
# 验证结果 |
||||||
|
if isinstance(batch_results, list) and len(batch_results) == len(current_batch): |
||||||
|
all_results.extend(batch_results) |
||||||
|
print(f"✓ 成功处理 {len(batch_results)} 个句子对") |
||||||
|
else: |
||||||
|
print( |
||||||
|
f"✗ 响应格式不正确,期望 {len(current_batch)} 个结果,实际得到 {len(batch_results) if isinstance(batch_results, list) else 'non-list'}") |
||||||
|
# 添加错误记录 |
||||||
|
for j, pair in enumerate(current_batch): |
||||||
|
all_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": "处理失败:响应格式错误", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
else: |
||||||
|
print(f"✗ 无法找到有效的JSON响应") |
||||||
|
print(f"原始响应前200字符: {ai_response[:200]}...") |
||||||
|
# 添加错误记录 |
||||||
|
for j, pair in enumerate(current_batch): |
||||||
|
all_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": "处理失败:JSON解析错误", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
|
||||||
|
except json.JSONDecodeError as e: |
||||||
|
print(f"✗ JSON解析失败: {str(e)}") |
||||||
|
print(f"原始响应: {ai_response[:200]}...") |
||||||
|
|
||||||
|
# 添加错误记录 |
||||||
|
for j, pair in enumerate(current_batch): |
||||||
|
all_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": f"处理失败:{str(e)}", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
|
||||||
|
# 添加延时,避免API调用过于频繁 |
||||||
|
time.sleep(2) |
||||||
|
|
||||||
|
except Exception as e: |
||||||
|
print(f"✗ 批次处理出错: {str(e)}") |
||||||
|
|
||||||
|
# 添加错误记录 |
||||||
|
for j, pair in enumerate(current_batch): |
||||||
|
all_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": f"处理异常:{str(e)}", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
|
||||||
|
return all_results |
||||||
|
|
||||||
|
# 执行批量标注 |
||||||
|
print("=" * 60) |
||||||
|
print("开始执行批量段落分割标注(从source_id 7开始)") |
||||||
|
print("=" * 60) |
||||||
|
|
||||||
|
# 加载上下文数据 |
||||||
|
context_dict = load_context_data("batch_deduplication_results_619-1103_01.csv") |
||||||
|
|
||||||
|
# 从JSON文件加载数据 |
||||||
|
input_file = "all_sentence_pairs_for_annotation.json" |
||||||
|
|
||||||
|
try: |
||||||
|
print(f"正在读取数据文件: {input_file}") |
||||||
|
with open(input_file, 'r', encoding='utf-8') as f: |
||||||
|
all_sentence_pairs_data = json.load(f) |
||||||
|
|
||||||
|
print(f"✓ 成功加载数据文件") |
||||||
|
print(f" - 总句子对数量: {len(all_sentence_pairs_data)}") |
||||||
|
|
||||||
|
# 检查数据格式 |
||||||
|
if len(all_sentence_pairs_data) > 0: |
||||||
|
sample_item = all_sentence_pairs_data[0] |
||||||
|
required_fields = ['sentence1', 'sentence2', 'source_id'] |
||||||
|
missing_fields = [field for field in required_fields if field not in sample_item] |
||||||
|
|
||||||
|
if missing_fields: |
||||||
|
print(f"✗ 数据格式错误,缺少字段: {missing_fields}") |
||||||
|
print(f" 实际字段: {list(sample_item.keys())}") |
||||||
|
exit() |
||||||
|
|
||||||
|
# 获取所有unique的source_id |
||||||
|
all_source_ids = sorted(set(item.get('source_id') for item in all_sentence_pairs_data)) |
||||||
|
print(f"✓ 发现的source_id列表: {all_source_ids}") |
||||||
|
|
||||||
|
# 【修改】:筛选出source_id >= 7的ID |
||||||
|
filtered_source_ids = [sid for sid in all_source_ids if sid >= 7] |
||||||
|
print(f"✓ 筛选后的source_id列表(>=7): {filtered_source_ids}") |
||||||
|
|
||||||
|
if not filtered_source_ids: |
||||||
|
print("✗ 没有找到source_id >= 7的数据") |
||||||
|
exit() |
||||||
|
|
||||||
|
# 统计各source_id的句子对数量 |
||||||
|
from collections import Counter |
||||||
|
|
||||||
|
source_counts = Counter(item.get('source_id') for item in all_sentence_pairs_data) |
||||||
|
print(f" - 各source_id的句子对数量(>=7):") |
||||||
|
for source_id in filtered_source_ids: |
||||||
|
print(f" source_id {source_id}: {source_counts[source_id]} 对") |
||||||
|
|
||||||
|
# 检查上下文数据可用性 |
||||||
|
print(f"\n上下文数据可用性检查:") |
||||||
|
available_context_ids = [] |
||||||
|
missing_context_ids = [] |
||||||
|
|
||||||
|
for source_id in filtered_source_ids: |
||||||
|
if source_id in context_dict and context_dict[source_id]: |
||||||
|
available_context_ids.append(source_id) |
||||||
|
print(f" ✓ source_id {source_id}: 上下文长度 {len(context_dict[source_id])} 字符") |
||||||
|
else: |
||||||
|
missing_context_ids.append(source_id) |
||||||
|
print(f" ✗ source_id {source_id}: 缺少上下文数据") |
||||||
|
|
||||||
|
if missing_context_ids: |
||||||
|
print(f"\n警告:以下source_id缺少上下文数据: {missing_context_ids}") |
||||||
|
print("这些ID的标注可能不够准确") |
||||||
|
|
||||||
|
print(f"\n开始处理source_id >= 7的标注任务...") |
||||||
|
|
||||||
|
except FileNotFoundError: |
||||||
|
print(f"✗ 错误:找不到文件 {input_file}") |
||||||
|
print("请确保文件存在于当前目录中") |
||||||
|
exit() |
||||||
|
except json.JSONDecodeError as e: |
||||||
|
print(f"✗ 错误:JSON文件格式错误 - {str(e)}") |
||||||
|
exit() |
||||||
|
except Exception as e: |
||||||
|
print(f"✗ 错误:读取文件时出现异常 - {str(e)}") |
||||||
|
exit() |
||||||
|
|
||||||
|
# 准备所有结果 |
||||||
|
all_results = [] |
||||||
|
|
||||||
|
# 【修改】:遍历筛选后的source_id(>=7) |
||||||
|
for current_source_id in filtered_source_ids: |
||||||
|
print(f"\n{'=' * 50}") |
||||||
|
print(f"处理 source_id: {current_source_id}") |
||||||
|
print(f"{'=' * 50}") |
||||||
|
|
||||||
|
# 筛选当前source_id的数据 |
||||||
|
current_sentence_pairs = [item for item in all_sentence_pairs_data if item.get('source_id') == current_source_id] |
||||||
|
|
||||||
|
if len(current_sentence_pairs) == 0: |
||||||
|
print(f"✗ 警告:source_id={current_source_id} 没有找到句子对数据") |
||||||
|
continue |
||||||
|
|
||||||
|
print(f" - 当前source_id的句子对数量: {len(current_sentence_pairs)}") |
||||||
|
|
||||||
|
# 显示第一个句子对预览 |
||||||
|
if len(current_sentence_pairs) > 0: |
||||||
|
first_pair = current_sentence_pairs[0] |
||||||
|
print(f" - 第一个句子对预览:") |
||||||
|
print(f" 句子1: {first_pair['sentence1'][:60]}...") |
||||||
|
print(f" 句子2: {first_pair['sentence2'][:60]}...") |
||||||
|
|
||||||
|
# 检查上下文数据 |
||||||
|
if current_source_id in context_dict and context_dict[current_source_id]: |
||||||
|
print(f" - 上下文数据: 可用,长度 {len(context_dict[current_source_id])} 字符") |
||||||
|
print(f" - 上下文预览: {context_dict[current_source_id][:100]}...") |
||||||
|
else: |
||||||
|
print(f" - 上下文数据: 不可用") |
||||||
|
|
||||||
|
try: |
||||||
|
print(f" - 开始处理标注...") |
||||||
|
|
||||||
|
# 执行批量标注(每批处理8个句子对) |
||||||
|
current_results = process_batch_segmentation(current_sentence_pairs, context_dict, batch_size=8) |
||||||
|
|
||||||
|
# 添加到总结果中 |
||||||
|
all_results.extend(current_results) |
||||||
|
|
||||||
|
# 统计当前source_id的结果 |
||||||
|
current_successful = len([r for r in current_results if r['label'] in [0, 1]]) |
||||||
|
current_failed = len([r for r in current_results if r['label'] == -1]) |
||||||
|
|
||||||
|
print(f" ✓ source_id {current_source_id} 处理完成") |
||||||
|
print(f" - 成功标注: {current_successful}") |
||||||
|
print(f" - 失败数量: {current_failed}") |
||||||
|
|
||||||
|
if current_successful > 0: |
||||||
|
current_label_0 = len([r for r in current_results if r['label'] == 0]) |
||||||
|
current_label_1 = len([r for r in current_results if r['label'] == 1]) |
||||||
|
print(f" - 标签0(同段落): {current_label_0}") |
||||||
|
print(f" - 标签1(分段): {current_label_1}") |
||||||
|
|
||||||
|
# 添加延时,避免处理过快 |
||||||
|
time.sleep(1) |
||||||
|
|
||||||
|
except Exception as e: |
||||||
|
print(f" ✗ source_id {current_source_id} 处理失败: {str(e)}") |
||||||
|
|
||||||
|
# 为失败的source_id添加错误记录 |
||||||
|
for pair in current_sentence_pairs: |
||||||
|
all_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": f"source_id处理异常:{str(e)}", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
|
||||||
|
print(f"\n{'=' * 60}") |
||||||
|
print("所有source_id(>=7)处理完成!") |
||||||
|
print(f"{'=' * 60}") |
||||||
|
|
||||||
|
# 使用所有结果进行后续处理 |
||||||
|
results = all_results |
||||||
|
|
||||||
|
print(f"\n{'=' * 60}") |
||||||
|
print("批量标注完成!") |
||||||
|
print(f"{'=' * 60}") |
||||||
|
|
||||||
|
# 统计结果 |
||||||
|
total_processed = len(results) |
||||||
|
successful_labels = len([r for r in results if r['label'] in [0, 1]]) |
||||||
|
failed_labels = len([r for r in results if r['label'] == -1]) |
||||||
|
|
||||||
|
print(f"总处理数量: {total_processed}") |
||||||
|
print(f"成功标注: {successful_labels}") |
||||||
|
print(f"失败数量: {failed_labels}") |
||||||
|
|
||||||
|
if successful_labels > 0: |
||||||
|
label_0_count = len([r for r in results if r['label'] == 0]) |
||||||
|
label_1_count = len([r for r in results if r['label'] == 1]) |
||||||
|
print(f"标签0(同段落): {label_0_count}") |
||||||
|
print(f"标签1(分段): {label_1_count}") |
||||||
|
|
||||||
|
# 【修改】:保存结果到CSV,文件名增加后缀以区分 |
||||||
|
result_df = pd.DataFrame(results) |
||||||
|
output_file = "segmentation_labeling_results_from_7.csv" |
||||||
|
result_df.to_csv(output_file, index=False, encoding='utf-8-sig') |
||||||
|
|
||||||
|
print(f"\n结果已保存到: {output_file}") |
||||||
|
|
||||||
|
# 显示前几条结果 |
||||||
|
print(f"\n前3条标注结果预览:") |
||||||
|
for i, result in enumerate(results[:3]): |
||||||
|
print(f"\n{i + 1}. Source ID: {result['source_id']}") |
||||||
|
print(f" 句子1: {result['sentence1'][:50]}...") |
||||||
|
print(f" 句子2: {result['sentence2'][:50]}...") |
||||||
|
print(f" 标签: {result['label']}") |
||||||
|
print(f" 理由: {result['reason']}") |
||||||
|
|
||||||
|
# 【修改】:保存详细的JSON结果,文件名增加后缀以区分 |
||||||
|
json_output_file = 'segmentation_results_from_7.json' |
||||||
|
with open(json_output_file, 'w', encoding='utf-8') as f: |
||||||
|
json.dump(results, f, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
print(f"详细JSON结果已保存到: {json_output_file}") |
||||||
|
|
||||||
|
# 【新增】:显示处理的source_id范围统计 |
||||||
|
if results: |
||||||
|
processed_source_ids = sorted(set(r['source_id'] for r in results)) |
||||||
|
print(f"\n实际处理的source_id范围: {min(processed_source_ids)} - {max(processed_source_ids)}") |
||||||
|
print(f"共处理了 {len(processed_source_ids)} 个source_id") |
||||||
@ -0,0 +1,581 @@ |
|||||||
|
import requests |
||||||
|
import json |
||||||
|
import pandas as pd |
||||||
|
from typing import List, Dict |
||||||
|
import time |
||||||
|
|
||||||
|
|
||||||
|
class SimpleOpenAIHubClient: |
||||||
|
def __init__(self, api_key): |
||||||
|
self.api_key = api_key |
||||||
|
self.base_url = "https://api.openai-hub.com" |
||||||
|
self.headers = { |
||||||
|
"Authorization": f"Bearer {api_key}", |
||||||
|
"Content-Type": "application/json" |
||||||
|
} |
||||||
|
|
||||||
|
def chat(self, prompt, model="gpt-4.1"): |
||||||
|
"""发送prompt并返回模型回答""" |
||||||
|
payload = { |
||||||
|
"model": model, |
||||||
|
"messages": [ |
||||||
|
{ |
||||||
|
"role": "user", |
||||||
|
"content": prompt |
||||||
|
} |
||||||
|
], |
||||||
|
"max_tokens": 32768, |
||||||
|
"temperature": 0.7 |
||||||
|
} |
||||||
|
|
||||||
|
try: |
||||||
|
response = requests.post( |
||||||
|
f"{self.base_url}/v1/chat/completions", |
||||||
|
headers=self.headers, |
||||||
|
json=payload, |
||||||
|
timeout=60 |
||||||
|
) |
||||||
|
|
||||||
|
if response.status_code == 200: |
||||||
|
result = response.json() |
||||||
|
return result['choices'][0]['message']['content'] |
||||||
|
else: |
||||||
|
return f"错误: {response.status_code} - {response.text}" |
||||||
|
except requests.exceptions.RequestException as e: |
||||||
|
return f"请求异常: {str(e)}" |
||||||
|
|
||||||
|
|
||||||
|
print("AI客户端类定义完成!") |
||||||
|
|
||||||
|
# 设置API Key |
||||||
|
API_KEY = "sk-XREp2jnIXyZ6UoCnzZeO0ahmLi9OEXuVAtFLojKFpG9gCZ4e" # 请替换为你的实际API Key |
||||||
|
|
||||||
|
# 初始化AI客户端 |
||||||
|
client = SimpleOpenAIHubClient(API_KEY) |
||||||
|
print("AI模型加载完成!") |
||||||
|
|
||||||
|
# 定义批量标注的Prompt模板 |
||||||
|
BATCH_SEGMENTATION_PROMPT = """你是一个专业的广播内容段落分割标注员。你的任务是批量判断多个相邻句子对之间是否应该进行段落分割,以便广播员更好地掌握停顿和语调变化。 |
||||||
|
|
||||||
|
**完整文本内容上下文:** |
||||||
|
{context_text} |
||||||
|
|
||||||
|
**标注规则:** |
||||||
|
- 标签0:两个句子属于同一段落,连续播报,轻微停顿 |
||||||
|
- 标签1:两个句子属于不同段落,需要明显停顿或语调转换 |
||||||
|
|
||||||
|
**重要标注要求(请严格遵循):** |
||||||
|
- 如果整个文本内容都在讲同一个事,你有理由只输出一段,不是追求分的段越多越细就越好 |
||||||
|
- 每个分段必须保持原始语句的绝对顺序 |
||||||
|
- 最终分段数可能等于或小于原始语句数量 |
||||||
|
- 必须保留所有原始语句文本,不得遗漏任何内容 |
||||||
|
- 应客户强烈要求,他们需要的是较粗的分段,不要太细,如同一条通告,不需要分段成具体的每个条款之类的,只需要将整个相同的通告分成一段 |
||||||
|
- 优先考虑较粗的分段,避免过度细分 |
||||||
|
|
||||||
|
**广播分段判断依据(偏向粗分段):** |
||||||
|
1. **重大主题转换**:从一个完全不同的话题转向另一个话题(如从天气预报转向安全通知) |
||||||
|
2. **文档类型变化**:从一个完整文档转向另一个完整文档(如从禁火令转向倡议书) |
||||||
|
3. **内容性质变化**:从通知类内容转向完全不同性质的内容(如从法规转向天气预报) |
||||||
|
4. **广播节目段落**:明显的广播节目结构变化(如开场白结束进入正式内容) |
||||||
|
5. **分点阐述结构**:标题和所有分点条目内容应该合并为一个完整段落(如"森林防火十不准,一不乱扔烟头,二不随意丢弃火种,三不在林区吸烟"等整体合成一段) |
||||||
|
|
||||||
|
**广播内容特别注意(粗分段原则):** |
||||||
|
- 整个通告、法令、倡议书等应作为一个段落,不要拆分条款 |
||||||
|
- 同一主题的多个条款应保持在同一段落 |
||||||
|
- 只有在完全不同的文档或重大主题转换时才分段 |
||||||
|
- 广播开场白可以独立成段,但具体内容尽量合并 |
||||||
|
- 同一类型的预报信息(如天气预报的不同地区)应保持在同一段 |
||||||
|
- **分点阐述内容的特殊处理**: |
||||||
|
- 标题性内容(如"森林防火十不准")与分点条目内容之间不需要分段 |
||||||
|
- 标题和所有的分点条目(如"一不乱扔烟头"、"二不随意丢弃火种"、"三不在林区吸烟"等)应该合并为一个完整段落 |
||||||
|
- 分点条目之间不需要分段,应该连续播报 |
||||||
|
- 整个分点阐述结构作为一个完整的内容单元,保持连贯性 |
||||||
|
|
||||||
|
**批量标注说明:** |
||||||
|
- 每个句子对都有一个source_id,表示来源文档 |
||||||
|
- 请保持原有的source_id不变 |
||||||
|
- 将label从-1改为实际的标注结果(0或1) |
||||||
|
- 为每个句子对提供简要的分段理由 |
||||||
|
- 结合上述完整文本内容理解句子对的上下文语境 |
||||||
|
- **特别重要:倾向于标注更多的0(同段落),减少1(分段)的使用,分点阐述结构应保持为一个完整段落** |
||||||
|
|
||||||
|
现在请对以下句子对进行批量标注: |
||||||
|
|
||||||
|
{batch_sentence_pairs} |
||||||
|
|
||||||
|
请直接输出标注结果,格式如下: |
||||||
|
```json |
||||||
|
[ |
||||||
|
{{ |
||||||
|
"sentence1": "...", |
||||||
|
"sentence2": "...", |
||||||
|
"label": 0或1, |
||||||
|
"reason": "广播分段理由", |
||||||
|
"source_id": 原有的source_id |
||||||
|
}} |
||||||
|
] |
||||||
|
``` |
||||||
|
|
||||||
|
只输出JSON数据,不要其他说明文字。""" |
||||||
|
|
||||||
|
|
||||||
|
def load_context_data(csv_file="batch_deduplication_results_619-1103_01.csv"): |
||||||
|
""" |
||||||
|
从batch_deduplication_results_619-1103_01.csv加载上下文数据 |
||||||
|
|
||||||
|
Args: |
||||||
|
csv_file: CSV文件路径 |
||||||
|
|
||||||
|
Returns: |
||||||
|
字典,key为id,value为final_processed_text |
||||||
|
""" |
||||||
|
try: |
||||||
|
print(f"正在读取上下文数据文件: {csv_file}") |
||||||
|
|
||||||
|
# 尝试不同的编码格式 |
||||||
|
encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin-1', 'cp1252'] |
||||||
|
context_df = None |
||||||
|
|
||||||
|
for encoding in encodings: |
||||||
|
try: |
||||||
|
print(f" 尝试使用 {encoding} 编码...") |
||||||
|
context_df = pd.read_csv(csv_file, encoding=encoding) |
||||||
|
print(f" ✓ 成功使用 {encoding} 编码读取文件") |
||||||
|
break |
||||||
|
except UnicodeDecodeError: |
||||||
|
print(f" {encoding} 编码失败") |
||||||
|
continue |
||||||
|
except Exception as e: |
||||||
|
print(f" {encoding} 编码读取时出现其他错误:{str(e)}") |
||||||
|
continue |
||||||
|
|
||||||
|
if context_df is None: |
||||||
|
print(f"✗ 错误:尝试了所有编码格式都无法读取文件 {csv_file}") |
||||||
|
return {} |
||||||
|
|
||||||
|
if 'id' not in context_df.columns or 'final_processed_text' not in context_df.columns: |
||||||
|
print(f"✗ 错误:CSV文件缺少必需列") |
||||||
|
print(f" 需要的列: ['id', 'final_processed_text']") |
||||||
|
print(f" 实际的列: {list(context_df.columns)}") |
||||||
|
return {} |
||||||
|
|
||||||
|
# 创建id到final_processed_text的映射 |
||||||
|
context_dict = {} |
||||||
|
for _, row in context_df.iterrows(): |
||||||
|
context_dict[row['id']] = row['final_processed_text'] if pd.notna(row['final_processed_text']) else "" |
||||||
|
|
||||||
|
print(f"✓ 成功加载上下文数据") |
||||||
|
print(f" - 可用ID数量: {len(context_dict)}") |
||||||
|
print(f" - 可用ID列表: {sorted(context_dict.keys())}") |
||||||
|
|
||||||
|
return context_dict |
||||||
|
|
||||||
|
except FileNotFoundError: |
||||||
|
print(f"✗ 警告:找不到上下文文件 {csv_file}") |
||||||
|
print(" 将在没有上下文的情况下进行标注") |
||||||
|
return {} |
||||||
|
except Exception as e: |
||||||
|
print(f"✗ 读取上下文文件时出错: {str(e)}") |
||||||
|
print(" 将在没有上下文的情况下进行标注") |
||||||
|
return {} |
||||||
|
|
||||||
|
|
||||||
|
def load_failed_data_from_json(json_file="segmentation_results_from_7.json"): |
||||||
|
""" |
||||||
|
从JSON结果文件中加载标注失败的数据 |
||||||
|
|
||||||
|
Args: |
||||||
|
json_file: JSON结果文件路径 |
||||||
|
|
||||||
|
Returns: |
||||||
|
失败数据列表 |
||||||
|
""" |
||||||
|
try: |
||||||
|
print(f"正在读取JSON结果文件: {json_file}") |
||||||
|
with open(json_file, 'r', encoding='utf-8') as f: |
||||||
|
all_results = json.load(f) |
||||||
|
|
||||||
|
print(f"✓ 成功加载JSON文件") |
||||||
|
print(f" - 总结果数量: {len(all_results)}") |
||||||
|
|
||||||
|
# 筛选出失败的数据(label为-1) |
||||||
|
failed_data = [item for item in all_results if item.get('label') == -1] |
||||||
|
successful_data = [item for item in all_results if item.get('label') in [0, 1]] |
||||||
|
|
||||||
|
print(f" - 成功标注数量: {len(successful_data)}") |
||||||
|
print(f" - 失败标注数量: {len(failed_data)}") |
||||||
|
|
||||||
|
if len(failed_data) == 0: |
||||||
|
print("✓ 没有发现失败的标注数据,无需重新处理") |
||||||
|
return [], all_results |
||||||
|
|
||||||
|
# 统计失败原因 |
||||||
|
from collections import Counter |
||||||
|
failure_reasons = Counter(item.get('reason', '未知错误') for item in failed_data) |
||||||
|
print(f"\n失败原因统计:") |
||||||
|
for reason, count in failure_reasons.most_common(): |
||||||
|
print(f" - {reason}: {count}次") |
||||||
|
|
||||||
|
# 统计涉及的source_id |
||||||
|
failed_source_ids = sorted(set(item.get('source_id') for item in failed_data)) |
||||||
|
print(f"\n涉及的source_id: {failed_source_ids}") |
||||||
|
|
||||||
|
return failed_data, all_results |
||||||
|
|
||||||
|
except FileNotFoundError: |
||||||
|
print(f"✗ 错误:找不到文件 {json_file}") |
||||||
|
return [], [] |
||||||
|
except json.JSONDecodeError as e: |
||||||
|
print(f"✗ 错误:JSON文件格式错误 - {str(e)}") |
||||||
|
return [], [] |
||||||
|
except Exception as e: |
||||||
|
print(f"✗ 错误:读取JSON文件时出现异常 - {str(e)}") |
||||||
|
return [], [] |
||||||
|
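# 用法说明(补充注释):返回 (失败记录列表, 全部结果列表);没有 label==-1 的记录时返回 ([], all_results),读取失败时返回 ([], [])。 |
||||||
|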
|
||||||
|
|
||||||
|
def convert_failed_data_to_sentence_pairs(failed_data): |
||||||
|
""" |
||||||
|
将失败数据转换为句子对格式,供重新标注使用 |
||||||
|
|
||||||
|
Args: |
||||||
|
failed_data: 失败数据列表 |
||||||
|
|
||||||
|
Returns: |
||||||
|
句子对格式的数据列表 |
||||||
|
""" |
||||||
|
sentence_pairs_data = [] |
||||||
|
|
||||||
|
for item in failed_data: |
||||||
|
sentence_pair = { |
||||||
|
"sentence1": item.get("sentence1", ""), |
||||||
|
"sentence2": item.get("sentence2", ""), |
||||||
|
"source_id": item.get("source_id"), |
||||||
|
"label": -1 # 标记为待标注 |
||||||
|
} |
||||||
|
sentence_pairs_data.append(sentence_pair) |
||||||
|
|
||||||
|
return sentence_pairs_data |
||||||
|
|
||||||
|
|
||||||
|
def process_batch_segmentation(sentence_pairs_data, context_dict, batch_size=8): |
||||||
|
""" |
||||||
|
批量处理句子对的段落分割标注 |
||||||
|
|
||||||
|
Args: |
||||||
|
sentence_pairs_data: 句子对数据列表 |
||||||
|
context_dict: 上下文数据字典 |
||||||
|
batch_size: 每批处理的数量 |
||||||
|
|
||||||
|
Returns: |
||||||
|
处理结果列表 |
||||||
|
""" |
||||||
|
all_results = [] |
||||||
|
total_pairs = len(sentence_pairs_data) |
||||||
|
|
||||||
|
print(f"开始批量标注,总共 {total_pairs} 个句子对") |
||||||
|
print(f"每批处理 {batch_size} 个句子对") |
||||||
|
|
||||||
|
# 分批处理 |
||||||
|
for i in range(0, total_pairs, batch_size): |
||||||
|
batch_end = min(i + batch_size, total_pairs) |
||||||
|
current_batch = sentence_pairs_data[i:batch_end] |
||||||
|
|
||||||
|
print(f"\n处理第 {i // batch_size + 1} 批 (句子对 {i + 1}-{batch_end})") |
||||||
|
|
||||||
|
try: |
||||||
|
# 获取当前批次涉及的source_id的上下文 |
||||||
|
source_ids_in_batch = set(pair['source_id'] for pair in current_batch) |
||||||
|
context_text = "" |
||||||
|
|
||||||
|
for source_id in sorted(source_ids_in_batch): |
||||||
|
if source_id in context_dict and context_dict[source_id]: |
||||||
|
context_text += f"\n--- Source ID {source_id} 完整文本内容 ---\n" |
||||||
|
context_text += context_dict[source_id] # 完整内容,不截断 |
||||||
|
context_text += "\n" |
||||||
|
else: |
||||||
|
context_text += f"\n--- Source ID {source_id} ---\n(未找到对应的完整文本内容)\n" |
||||||
|
|
||||||
|
# 准备当前批次的数据 |
||||||
|
batch_json = json.dumps(current_batch, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
# 构建prompt |
||||||
|
prompt = BATCH_SEGMENTATION_PROMPT.format( |
||||||
|
context_text=context_text, |
||||||
|
batch_sentence_pairs=batch_json |
||||||
|
) |
||||||
|
|
||||||
|
print(f"发送请求到AI模型...") |
||||||
|
print(f" - 涉及source_id: {sorted(source_ids_in_batch)}") |
||||||
|
print(f" - 上下文长度: {len(context_text)} 字符") |
||||||
|
print(f" - Prompt总长度: {len(prompt)} 字符") |
||||||
|
|
||||||
|
# 调用AI模型 |
||||||
|
ai_response = client.chat(prompt) |
||||||
|
|
||||||
|
print(f"收到模型响应") |
||||||
|
|
||||||
|
# 尝试解析JSON响应 |
||||||
|
try: |
||||||
|
# 提取JSON部分(去除可能的markdown格式) |
||||||
|
json_start = ai_response.find('[') |
||||||
|
json_end = ai_response.rfind(']') + 1 |
||||||
|
|
||||||
|
if json_start != -1 and json_end != 0: |
||||||
|
json_content = ai_response[json_start:json_end] |
||||||
|
batch_results = json.loads(json_content) |
||||||
|
|
||||||
|
# 验证结果 |
||||||
|
if isinstance(batch_results, list) and len(batch_results) == len(current_batch): |
||||||
|
all_results.extend(batch_results) |
||||||
|
print(f"✓ 成功处理 {len(batch_results)} 个句子对") |
||||||
|
else: |
||||||
|
print( |
||||||
|
f"✗ 响应格式不正确,期望 {len(current_batch)} 个结果,实际得到 {len(batch_results) if isinstance(batch_results, list) else 'non-list'}") |
||||||
|
# 添加错误记录 |
||||||
|
for j, pair in enumerate(current_batch): |
||||||
|
all_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": "重试失败:响应格式错误", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
else: |
||||||
|
print(f"✗ 无法找到有效的JSON响应") |
||||||
|
print(f"原始响应前200字符: {ai_response[:200]}...") |
||||||
|
# 添加错误记录 |
||||||
|
for j, pair in enumerate(current_batch): |
||||||
|
all_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": "重试失败:JSON解析错误", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
|
||||||
|
except json.JSONDecodeError as e: |
||||||
|
print(f"✗ JSON解析失败: {str(e)}") |
||||||
|
print(f"原始响应: {ai_response[:200]}...") |
||||||
|
|
||||||
|
# 添加错误记录 |
||||||
|
for j, pair in enumerate(current_batch): |
||||||
|
all_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": f"重试失败:{str(e)}", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
|
||||||
|
# 添加延时,避免API调用过于频繁 |
||||||
|
time.sleep(2) |
||||||
|
|
||||||
|
except Exception as e: |
||||||
|
print(f"✗ 批次处理出错: {str(e)}") |
||||||
|
|
||||||
|
# 添加错误记录 |
||||||
|
for j, pair in enumerate(current_batch): |
||||||
|
all_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": f"重试异常:{str(e)}", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
|
||||||
|
return all_results |
||||||
|
|
||||||
|
|
||||||
|
def merge_results(original_results, retry_results): |
||||||
|
""" |
||||||
|
合并原始结果和重试结果 |
||||||
|
|
||||||
|
Args: |
||||||
|
original_results: 原始完整结果列表 |
||||||
|
retry_results: 重试结果列表 |
||||||
|
|
||||||
|
Returns: |
||||||
|
合并后的完整结果列表 |
||||||
|
""" |
||||||
|
print("正在合并结果...") |
||||||
|
|
||||||
|
# 创建重试结果的映射,用于快速查找 |
||||||
|
retry_map = {} |
||||||
|
for result in retry_results: |
||||||
|
key = (result['sentence1'], result['sentence2'], result['source_id']) |
||||||
|
retry_map[key] = result |
||||||
|
|
||||||
|
merged_results = [] |
||||||
|
replaced_count = 0 |
||||||
|
|
||||||
|
for original_result in original_results: |
||||||
|
key = (original_result['sentence1'], original_result['sentence2'], original_result['source_id']) |
||||||
|
|
||||||
|
# 如果原始结果是失败的,并且重试结果存在,则用重试结果替换 |
||||||
|
if original_result.get('label') == -1 and key in retry_map: |
||||||
|
merged_results.append(retry_map[key]) |
||||||
|
replaced_count += 1 |
||||||
|
else: |
||||||
|
merged_results.append(original_result) |
||||||
|
|
||||||
|
print(f"✓ 合并完成,替换了 {replaced_count} 个失败结果") |
||||||
|
return merged_results |
||||||
|
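# 示例(说明性注释,数据为假设):若 original_results 中某条记录 label=-1, |
||||||
|
# 且 retry_results 中存在 (sentence1, sentence2, source_id) 完全相同的记录,则以重试结果替换该条,其余记录原样保留。 |
||||||
|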
|
||||||
|
|
||||||
|
# 执行重新标注失败数据 |
||||||
|
print("=" * 60) |
||||||
|
print("开始重新标注失败数据") |
||||||
|
print("=" * 60) |
||||||
|
|
||||||
|
# 加载上下文数据 |
||||||
|
context_dict = load_context_data("batch_deduplication_results_619-1103_01.csv") |
||||||
|
|
||||||
|
# 从JSON文件加载失败数据 |
||||||
|
failed_data, original_results = load_failed_data_from_json("segmentation_results_from_7.json") |
||||||
|
|
||||||
|
if len(failed_data) == 0: |
||||||
|
print("没有需要重新标注的数据,程序结束。") |
||||||
|
exit() |
||||||
|
|
||||||
|
print(f"\n开始重新标注 {len(failed_data)} 个失败的句子对...") |
||||||
|
|
||||||
|
# 将失败数据转换为句子对格式 |
||||||
|
retry_sentence_pairs = convert_failed_data_to_sentence_pairs(failed_data) |
||||||
|
|
||||||
|
# 按source_id分组处理(与原始代码保持一致的处理方式) |
||||||
|
from collections import defaultdict |
||||||
|
|
||||||
|
grouped_by_source_id = defaultdict(list) |
||||||
|
for pair in retry_sentence_pairs: |
||||||
|
grouped_by_source_id[pair['source_id']].append(pair) |
||||||
|
|
||||||
|
print(f"失败数据涉及 {len(grouped_by_source_id)} 个source_id") |
||||||
|
|
||||||
|
# 处理每个source_id的失败数据 |
||||||
|
all_retry_results = [] |
||||||
|
|
||||||
|
for source_id in sorted(grouped_by_source_id.keys()): |
||||||
|
current_sentence_pairs = grouped_by_source_id[source_id] |
||||||
|
|
||||||
|
print(f"\n{'=' * 50}") |
||||||
|
print(f"重新处理 source_id: {source_id}") |
||||||
|
print(f"{'=' * 50}") |
||||||
|
print(f" - 该source_id的失败句子对数量: {len(current_sentence_pairs)}") |
||||||
|
|
||||||
|
# 显示第一个句子对预览 |
||||||
|
if len(current_sentence_pairs) > 0: |
||||||
|
first_pair = current_sentence_pairs[0] |
||||||
|
print(f" - 第一个句子对预览:") |
||||||
|
print(f" 句子1: {first_pair['sentence1'][:60]}...") |
||||||
|
print(f" 句子2: {first_pair['sentence2'][:60]}...") |
||||||
|
|
||||||
|
# 检查上下文数据 |
||||||
|
if source_id in context_dict and context_dict[source_id]: |
||||||
|
print(f" - 上下文数据: 可用,长度 {len(context_dict[source_id])} 字符") |
||||||
|
print(f" - 上下文预览: {context_dict[source_id][:100]}...") |
||||||
|
else: |
||||||
|
print(f" - 上下文数据: 不可用") |
||||||
|
|
||||||
|
try: |
||||||
|
print(f" - 开始重新标注...") |
||||||
|
|
||||||
|
# 执行批量标注(batch_size=8) |
||||||
|
current_results = process_batch_segmentation(current_sentence_pairs, context_dict, batch_size=8) |
||||||
|
|
||||||
|
# 添加到总结果中 |
||||||
|
all_retry_results.extend(current_results) |
||||||
|
|
||||||
|
# 统计当前source_id的结果 |
||||||
|
current_successful = len([r for r in current_results if r['label'] in [0, 1]]) |
||||||
|
current_failed = len([r for r in current_results if r['label'] == -1]) |
||||||
|
|
||||||
|
print(f" ✓ source_id {source_id} 重新标注完成") |
||||||
|
print(f" - 成功标注: {current_successful}") |
||||||
|
print(f" - 仍然失败: {current_failed}") |
||||||
|
|
||||||
|
if current_successful > 0: |
||||||
|
current_label_0 = len([r for r in current_results if r['label'] == 0]) |
||||||
|
current_label_1 = len([r for r in current_results if r['label'] == 1]) |
||||||
|
print(f" - 标签0(同段落): {current_label_0}") |
||||||
|
print(f" - 标签1(分段): {current_label_1}") |
||||||
|
|
||||||
|
# 添加延时 |
||||||
|
time.sleep(1) |
||||||
|
|
||||||
|
except Exception as e: |
||||||
|
print(f" ✗ source_id {source_id} 重新标注失败: {str(e)}") |
||||||
|
|
||||||
|
# 为失败的source_id添加错误记录 |
||||||
|
for pair in current_sentence_pairs: |
||||||
|
all_retry_results.append({ |
||||||
|
"sentence1": pair["sentence1"], |
||||||
|
"sentence2": pair["sentence2"], |
||||||
|
"label": -1, |
||||||
|
"reason": f"source_id重试异常:{str(e)}", |
||||||
|
"source_id": pair["source_id"] |
||||||
|
}) |
||||||
|
|
||||||
|
print(f"\n{'=' * 60}") |
||||||
|
print("重新标注完成!") |
||||||
|
print(f"{'=' * 60}") |
||||||
|
|
||||||
|
# 统计重试结果 |
||||||
|
retry_successful = len([r for r in all_retry_results if r['label'] in [0, 1]]) |
||||||
|
retry_failed = len([r for r in all_retry_results if r['label'] == -1]) |
||||||
|
|
||||||
|
print(f"重试结果统计:") |
||||||
|
print(f" - 总重试数量: {len(all_retry_results)}") |
||||||
|
print(f" - 重试成功: {retry_successful}") |
||||||
|
print(f" - 重试仍失败: {retry_failed}") |
||||||
|
|
||||||
|
if retry_successful > 0: |
||||||
|
retry_label_0 = len([r for r in all_retry_results if r['label'] == 0]) |
||||||
|
retry_label_1 = len([r for r in all_retry_results if r['label'] == 1]) |
||||||
|
print(f" - 标签0(同段落): {retry_label_0}") |
||||||
|
print(f" - 标签1(分段): {retry_label_1}") |
||||||
|
|
||||||
|
# 合并原始结果和重试结果 |
||||||
|
final_results = merge_results(original_results, all_retry_results) |
||||||
|
|
||||||
|
# 统计最终结果 |
||||||
|
final_successful = len([r for r in final_results if r['label'] in [0, 1]]) |
||||||
|
final_failed = len([r for r in final_results if r['label'] == -1]) |
||||||
|
|
||||||
|
print(f"\n最终结果统计:") |
||||||
|
print(f" - 总数据量: {len(final_results)}") |
||||||
|
print(f" - 成功标注: {final_successful}") |
||||||
|
print(f" - 失败标注: {final_failed}") |
||||||
|
print(f" - 成功率: {final_successful / len(final_results) * 100:.2f}%") |
||||||
|
|
||||||
|
# 保存合并后的结果 |
||||||
|
final_result_df = pd.DataFrame(final_results) |
||||||
|
final_csv_file = "segmentation_results_from_7_retried.csv" |
||||||
|
final_result_df.to_csv(final_csv_file, index=False, encoding='utf-8-sig') |
||||||
|
|
||||||
|
print(f"\n最终结果已保存到: {final_csv_file}") |
||||||
|
|
||||||
|
# 保存详细的JSON结果 |
||||||
|
final_json_file = 'segmentation_results_from_7_retried.json' |
||||||
|
with open(final_json_file, 'w', encoding='utf-8') as f: |
||||||
|
json.dump(final_results, f, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
print(f"详细JSON结果已保存到: {final_json_file}") |
||||||
|
|
||||||
|
# 显示重试前后对比 |
||||||
|
print(f"\n重试前后对比:") |
||||||
|
original_successful = len([r for r in original_results if r['label'] in [0, 1]]) |
||||||
|
original_failed = len([r for r in original_results if r['label'] == -1]) |
||||||
|
print(f" - 重试前成功率: {original_successful / len(original_results) * 100:.2f}%") |
||||||
|
print(f" - 重试后成功率: {final_successful / len(final_results) * 100:.2f}%") |
||||||
|
print(f" - 成功率提升: {(final_successful - original_successful) / len(final_results) * 100:.2f}%") |
||||||
|
|
||||||
|
# 显示前几条重试成功的结果 |
||||||
|
successful_retries = [r for r in all_retry_results if r['label'] in [0, 1]] |
||||||
|
if len(successful_retries) > 0: |
||||||
|
print(f"\n前3条重试成功的结果预览:") |
||||||
|
for i, result in enumerate(successful_retries[:3]): |
||||||
|
print(f"\n{i + 1}. Source ID: {result['source_id']}") |
||||||
|
print(f" 句子1: {result['sentence1'][:50]}...") |
||||||
|
print(f" 句子2: {result['sentence2'][:50]}...") |
||||||
|
print(f" 标签: {result['label']}") |
||||||
|
print(f" 理由: {result['reason']}") |
||||||
@ -0,0 +1,192 @@ |
|||||||
|
import json |
||||||
|
from collections import defaultdict |
||||||
|
|
||||||
|
|
||||||
|
def create_cross_document_boundaries(input_file, output_file): |
||||||
|
""" |
||||||
|
创建跨文档边界的句子对数据 |
||||||
|
将source_id n的最后一句与source_id n+1的第一句配对,标签设为1(分段) |
||||||
|
""" |
||||||
|
|
||||||
|
# 读取原始数据 |
||||||
|
with open(input_file, 'r', encoding='utf-8') as f: |
||||||
|
data = json.load(f) |
||||||
|
|
||||||
|
# 按source_id分组数据 |
||||||
|
source_groups = defaultdict(list) |
||||||
|
for item in data: |
||||||
|
source_id = item['source_id'] |
||||||
|
source_groups[source_id].append(item) |
||||||
|
|
||||||
|
# 按source_id排序 |
||||||
|
sorted_source_ids = sorted(source_groups.keys()) |
||||||
|
|
||||||
|
# 存储新创建的跨文档边界数据 |
||||||
|
cross_boundary_data = [] |
||||||
|
|
||||||
|
print(f"处理 {len(sorted_source_ids)} 个source_id...") |
||||||
|
|
||||||
|
# 遍历相邻的source_id |
||||||
|
for i in range(len(sorted_source_ids) - 1): |
||||||
|
current_source_id = sorted_source_ids[i] |
||||||
|
next_source_id = sorted_source_ids[i + 1] |
||||||
|
|
||||||
|
current_group = source_groups[current_source_id] |
||||||
|
next_group = source_groups[next_source_id] |
||||||
|
|
||||||
|
if len(current_group) == 0 or len(next_group) == 0: |
||||||
|
continue |
||||||
|
|
||||||
|
# 获取当前source_id的最后一个句子对的sentence2 |
||||||
|
last_item = current_group[-1] |
||||||
|
last_sentence = last_item['sentence2'] |
||||||
|
|
||||||
|
# 获取下一个source_id的第一个句子对的sentence1 |
||||||
|
first_item = next_group[0] |
||||||
|
first_sentence = first_item['sentence1'] |
||||||
|
|
||||||
|
# 创建跨文档边界的句子对 |
||||||
|
cross_boundary_item = { |
||||||
|
"sentence1": last_sentence, |
||||||
|
"sentence2": first_sentence, |
||||||
|
"label": 1, # 跨文档必须分段 |
||||||
|
"reason": f"跨文档边界: source_id {current_source_id} 的结尾与 source_id {next_source_id} 的开头,属于不同文档,必须分段。", |
||||||
|
"source_id": f"{current_source_id}-{next_source_id}", |
||||||
|
"boundary_type": "cross_document" |
||||||
|
} |
||||||
|
|
||||||
|
cross_boundary_data.append(cross_boundary_item) |
||||||
|
|
||||||
|
print(f"创建跨界边界: {current_source_id} -> {next_source_id}") |
||||||
|
print(f" 句子1: {last_sentence[:50]}...") |
||||||
|
print(f" 句子2: {first_sentence[:50]}...") |
||||||
|
|
||||||
|
print(f"\n总共创建了 {len(cross_boundary_data)} 个跨文档边界样本") |
||||||
|
|
||||||
|
# 保存跨文档边界数据 |
||||||
|
with open(output_file, 'w', encoding='utf-8') as f: |
||||||
|
json.dump(cross_boundary_data, f, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
print(f"跨文档边界数据已保存到: {output_file}") |
||||||
|
|
||||||
|
return cross_boundary_data |
||||||
|
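# 示例(说明性注释,ID为假设):若 source_id 7 的最后一句为 A、source_id 8 的第一句为 B, |
||||||
|
# 则生成 {"sentence1": A, "sentence2": B, "label": 1, "source_id": "7-8", "boundary_type": "cross_document"}。 |
||||||
|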
|
||||||
|
|
||||||
|
def merge_with_original_data(original_file, cross_boundary_file, merged_output_file): |
||||||
|
""" |
||||||
|
将跨文档边界数据与原始数据合并 |
||||||
|
""" |
||||||
|
|
||||||
|
# 读取原始数据 |
||||||
|
with open(original_file, 'r', encoding='utf-8') as f: |
||||||
|
original_data = json.load(f) |
||||||
|
|
||||||
|
# 读取跨文档边界数据 |
||||||
|
with open(cross_boundary_file, 'r', encoding='utf-8') as f: |
||||||
|
cross_boundary_data = json.load(f) |
||||||
|
|
||||||
|
# 合并数据 |
||||||
|
merged_data = original_data + cross_boundary_data |
||||||
|
|
||||||
|
print(f"原始数据: {len(original_data)} 条") |
||||||
|
print(f"跨文档边界数据: {len(cross_boundary_data)} 条") |
||||||
|
print(f"合并后数据: {len(merged_data)} 条") |
||||||
|
|
||||||
|
# 统计标签分布 |
||||||
|
label_counts = {} |
||||||
|
for item in merged_data: |
||||||
|
label = item['label'] |
||||||
|
label_counts[label] = label_counts.get(label, 0) + 1 |
||||||
|
|
||||||
|
print(f"\n合并后标签分布:") |
||||||
|
for label, count in label_counts.items(): |
||||||
|
label_name = "不分段" if label == 0 else "分段" |
||||||
|
percentage = count / len(merged_data) * 100 |
||||||
|
print(f" 标签 {label} ({label_name}): {count} 条 ({percentage:.1f}%)") |
||||||
|
|
||||||
|
# 保存合并数据 |
||||||
|
with open(merged_output_file, 'w', encoding='utf-8') as f: |
||||||
|
json.dump(merged_data, f, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
print(f"\n合并数据已保存到: {merged_output_file}") |
||||||
|
|
||||||
|
return merged_data |
||||||
|
|
||||||
|
|
||||||
|
def analyze_source_structure(input_file): |
||||||
|
""" |
||||||
|
分析source_id的结构,便于理解数据 |
||||||
|
""" |
||||||
|
|
||||||
|
with open(input_file, 'r', encoding='utf-8') as f: |
||||||
|
data = json.load(f) |
||||||
|
|
||||||
|
# 按source_id分组 |
||||||
|
source_groups = defaultdict(list) |
||||||
|
for item in data: |
||||||
|
source_id = item['source_id'] |
||||||
|
source_groups[source_id].append(item) |
||||||
|
|
||||||
|
print(f"数据结构分析:") |
||||||
|
print(f"总共 {len(data)} 个句子对") |
||||||
|
print(f"涉及 {len(source_groups)} 个source_id") |
||||||
|
print(f"source_id范围: {min(source_groups.keys())} - {max(source_groups.keys())}") |
||||||
|
|
||||||
|
# 显示每个source_id的句子对数量 |
||||||
|
print(f"\n各source_id的句子对数量:") |
||||||
|
sorted_source_ids = sorted(source_groups.keys()) |
||||||
|
for source_id in sorted_source_ids: |
||||||
|
count = len(source_groups[source_id]) |
||||||
|
print(f" source_id {source_id}: {count} 个句子对") |
||||||
|
|
||||||
|
# 显示前几个source_id的示例 |
||||||
|
print(f"\n前3个source_id的示例:") |
||||||
|
for source_id in sorted_source_ids[:3]: |
||||||
|
items = source_groups[source_id] |
||||||
|
print(f"\nsource_id {source_id}:") |
||||||
|
print(f" 第一个句子对: {items[0]['sentence1'][:30]}... -> {items[0]['sentence2'][:30]}...") |
||||||
|
if len(items) > 1: |
||||||
|
print(f" 最后一个句子对: {items[-1]['sentence1'][:30]}... -> {items[-1]['sentence2'][:30]}...") |
||||||
|
|
||||||
|
|
||||||
|
def main(): |
||||||
|
""" |
||||||
|
主函数 - 处理跨文档边界数据 |
||||||
|
""" |
||||||
|
|
||||||
|
# 文件路径 |
||||||
|
input_file = "segmentation_results_from_7_retried.json" |
||||||
|
cross_boundary_output = "cross_document_boundaries.json" |
||||||
|
merged_output = "enhanced_training_data_with_boundaries.json" |
||||||
|
|
||||||
|
print("=" * 60) |
||||||
|
print("跨文档边界数据生成") |
||||||
|
print("=" * 60) |
||||||
|
|
||||||
|
# 1. 分析原始数据结构 |
||||||
|
print("1. 分析原始数据结构...") |
||||||
|
analyze_source_structure(input_file) |
||||||
|
|
||||||
|
print("\n" + "=" * 60) |
||||||
|
|
||||||
|
# 2. 创建跨文档边界数据 |
||||||
|
print("2. 创建跨文档边界数据...") |
||||||
|
cross_boundary_data = create_cross_document_boundaries(input_file, cross_boundary_output) |
||||||
|
|
||||||
|
print("\n" + "=" * 60) |
||||||
|
|
||||||
|
# 3. 合并数据 |
||||||
|
print("3. 合并原始数据与跨文档边界数据...") |
||||||
|
merged_data = merge_with_original_data(input_file, cross_boundary_output, merged_output) |
||||||
|
|
||||||
|
print("\n" + "=" * 60) |
||||||
|
print("处理完成!") |
||||||
|
print("=" * 60) |
||||||
|
print(f"生成的文件:") |
||||||
|
print(f" - 跨文档边界数据: {cross_boundary_output}") |
||||||
|
print(f" - 增强训练数据: {merged_output}") |
||||||
|
print(f"\n现在可以使用 {merged_output} 进行模型训练") |
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": |
||||||
|
main() |
||||||
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,542 @@ |
|||||||
|
=== 批量多级别去重详细报告 === |
||||||
|
|
||||||
|
处理日期: 2025-08-11 17:04:28 |
||||||
|
总共处理: 50 个ID |
||||||
|
成功处理: 50 个ID |
||||||
|
|
||||||
|
总体统计: |
||||||
|
- 平均压缩比: 24.59% |
||||||
|
- 总原始字符数: 108,025 |
||||||
|
- 总最终字符数: 57,951 |
||||||
|
- 总减少字符数: 50,038 |
||||||
|
|
||||||
|
|
||||||
|
--- ID 1104 详细报告 --- |
||||||
|
原始文本长度: 791 字符 |
||||||
|
最终文本长度: 791 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1105 详细报告 --- |
||||||
|
原始文本长度: 791 字符 |
||||||
|
最终文本长度: 791 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1106 详细报告 --- |
||||||
|
原始文本长度: 7591 字符 |
||||||
|
最终文本长度: 801 字符 |
||||||
|
总体压缩比: 89.45% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 6791 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 161 项内容 |
||||||
|
|
||||||
|
--- ID 1107 详细报告 --- |
||||||
|
原始文本长度: 19 字符 |
||||||
|
最终文本长度: 19 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1108 详细报告 --- |
||||||
|
原始文本长度: 3738 字符 |
||||||
|
最终文本长度: 1248 字符 |
||||||
|
总体压缩比: 66.61% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 2491 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 60 项内容 |
||||||
|
|
||||||
|
--- ID 1109 详细报告 --- |
||||||
|
原始文本长度: 4841 字符 |
||||||
|
最终文本长度: 4841 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1110 详细报告 --- |
||||||
|
原始文本长度: 177 字符 |
||||||
|
最终文本长度: 104 字符 |
||||||
|
总体压缩比: 41.24% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 74 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 1 项内容 |
||||||
|
|
||||||
|
--- ID 1111 详细报告 --- |
||||||
|
原始文本长度: 212 字符 |
||||||
|
最终文本长度: 212 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1112 详细报告 --- |
||||||
|
原始文本长度: 190 字符 |
||||||
|
最终文本长度: 116 字符 |
||||||
|
总体压缩比: 38.95% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 75 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 1 项内容 |
||||||
|
|
||||||
|
--- ID 1113 详细报告 --- |
||||||
|
原始文本长度: 1282 字符 |
||||||
|
最终文本长度: 1282 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1114 详细报告 --- |
||||||
|
原始文本长度: 5262 字符 |
||||||
|
最终文本长度: 5262 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1115 详细报告 --- |
||||||
|
原始文本长度: 5328 字符 |
||||||
|
最终文本长度: 2005 字符 |
||||||
|
总体压缩比: 62.37% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 2707 字符 |
||||||
|
2. 句子级去重: 减少 616 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 33 项内容 |
||||||
|
SENTENCES级别移除了 7 项内容 |
||||||
|
|
||||||
|
--- ID 1116 详细报告 --- |
||||||
|
原始文本长度: 5127 字符 |
||||||
|
最终文本长度: 5117 字符 |
||||||
|
总体压缩比: 0.20% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 11 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 1 项内容 |
||||||
|
|
||||||
|
--- ID 1117 详细报告 --- |
||||||
|
原始文本长度: 400 字符 |
||||||
|
最终文本长度: 400 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1118 详细报告 --- |
||||||
|
原始文本长度: 1296 字符 |
||||||
|
最终文本长度: 817 字符 |
||||||
|
总体压缩比: 36.96% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 480 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 11 项内容 |
||||||
|
|
||||||
|
--- ID 1119 详细报告 --- |
||||||
|
原始文本长度: 445 字符 |
||||||
|
最终文本长度: 284 字符 |
||||||
|
总体压缩比: 36.18% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 162 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 2 项内容 |
||||||
|
|
||||||
|
--- ID 1120 详细报告 --- |
||||||
|
原始文本长度: 795 字符 |
||||||
|
最终文本长度: 422 字符 |
||||||
|
总体压缩比: 46.92% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 374 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 10 项内容 |
||||||
|
|
||||||
|
--- ID 1121 详细报告 --- |
||||||
|
原始文本长度: 796 字符 |
||||||
|
最终文本长度: 424 字符 |
||||||
|
总体压缩比: 46.73% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 373 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 10 项内容 |
||||||
|
|
||||||
|
--- ID 1122 详细报告 --- |
||||||
|
原始文本长度: 125 字符 |
||||||
|
最终文本长度: 125 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1123 详细报告 --- |
||||||
|
原始文本长度: 37 字符 |
||||||
|
最终文本长度: 37 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1124 详细报告 --- |
||||||
|
原始文本长度: 3675 字符 |
||||||
|
最终文本长度: 3175 字符 |
||||||
|
总体压缩比: 13.61% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 501 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 14 项内容 |
||||||
|
|
||||||
|
--- ID 1125 详细报告 --- |
||||||
|
原始文本长度: 498 字符 |
||||||
|
最终文本长度: 249 字符 |
||||||
|
总体压缩比: 50.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 250 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 1 项内容 |
||||||
|
|
||||||
|
--- ID 1126 详细报告 --- |
||||||
|
原始文本长度: 2461 字符 |
||||||
|
最终文本长度: 486 字符 |
||||||
|
总体压缩比: 80.25% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1976 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 40 项内容 |
||||||
|
|
||||||
|
--- ID 1127 详细报告 --- |
||||||
|
原始文本长度: 2442 字符 |
||||||
|
最终文本长度: 1120 字符 |
||||||
|
总体压缩比: 54.14% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1323 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 15 项内容 |
||||||
|
|
||||||
|
--- ID 1128 详细报告 --- |
||||||
|
原始文本长度: 2560 字符 |
||||||
|
最终文本长度: 1779 字符 |
||||||
|
总体压缩比: 30.51% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 782 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 8 项内容 |
||||||
|
|
||||||
|
--- ID 1129 详细报告 --- |
||||||
|
原始文本长度: 2561 字符 |
||||||
|
最终文本长度: 1788 字符 |
||||||
|
总体压缩比: 30.18% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 774 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 7 项内容 |
||||||
|
|
||||||
|
--- ID 1130 详细报告 --- |
||||||
|
原始文本长度: 673 字符 |
||||||
|
最终文本长度: 673 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1131 详细报告 --- |
||||||
|
原始文本长度: 264 字符 |
||||||
|
最终文本长度: 264 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1132 详细报告 --- |
||||||
|
原始文本长度: 1566 字符 |
||||||
|
最终文本长度: 1442 字符 |
||||||
|
总体压缩比: 7.92% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 125 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 2 项内容 |
||||||
|
|
||||||
|
--- ID 1133 详细报告 --- |
||||||
|
原始文本长度: 1559 字符 |
||||||
|
最终文本长度: 1559 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1134 详细报告 --- |
||||||
|
原始文本长度: 2510 字符 |
||||||
|
最终文本长度: 356 字符 |
||||||
|
总体压缩比: 85.82% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 2155 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 141 项内容 |
||||||
|
|
||||||
|
--- ID 1135 详细报告 --- |
||||||
|
原始文本长度: 2530 字符 |
||||||
|
最终文本长度: 380 字符 |
||||||
|
总体压缩比: 84.98% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 2151 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 158 项内容 |
||||||
|
|
||||||
|
--- ID 1136 详细报告 --- |
||||||
|
原始文本长度: 251 字符 |
||||||
|
最终文本长度: 251 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1137 详细报告 --- |
||||||
|
原始文本长度: 3153 字符 |
||||||
|
最终文本长度: 571 字符 |
||||||
|
总体压缩比: 81.89% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 2583 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 147 项内容 |
||||||
|
|
||||||
|
--- ID 1138 详细报告 --- |
||||||
|
原始文本长度: 917 字符 |
||||||
|
最终文本长度: 883 字符 |
||||||
|
总体压缩比: 3.71% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 35 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 1 项内容 |
||||||
|
|
||||||
|
--- ID 1139 详细报告 --- |
||||||
|
原始文本长度: 908 字符 |
||||||
|
最终文本长度: 857 字符 |
||||||
|
总体压缩比: 5.62% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 52 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 1 项内容 |
||||||
|
|
||||||
|
--- ID 1140 详细报告 --- |
||||||
|
原始文本长度: 2797 字符 |
||||||
|
最终文本长度: 1656 字符 |
||||||
|
总体压缩比: 40.79% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1142 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 25 项内容 |
||||||
|
|
||||||
|
--- ID 1141 详细报告 --- |
||||||
|
原始文本长度: 800 字符 |
||||||
|
最终文本长度: 800 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1142 详细报告 --- |
||||||
|
原始文本长度: 618 字符 |
||||||
|
最终文本长度: 598 字符 |
||||||
|
总体压缩比: 3.24% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 21 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 1 项内容 |
||||||
|
|
||||||
|
--- ID 1143 详细报告 --- |
||||||
|
原始文本长度: 1330 字符 |
||||||
|
最终文本长度: 732 字符 |
||||||
|
总体压缩比: 44.96% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 599 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 11 项内容 |
||||||
|
|
||||||
|
--- ID 1144 详细报告 --- |
||||||
|
原始文本长度: 22010 字符 |
||||||
|
最终文本长度: 1494 字符 |
||||||
|
总体压缩比: 93.21% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 20517 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 342 项内容 |
||||||
|
|
||||||
|
--- ID 1145 详细报告 --- |
||||||
|
原始文本长度: 42 字符 |
||||||
|
最终文本长度: 42 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1146 详细报告 --- |
||||||
|
原始文本长度: 771 字符 |
||||||
|
最终文本长度: 771 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1147 详细报告 --- |
||||||
|
原始文本长度: 1183 字符 |
||||||
|
最终文本长度: 1183 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1148 详细报告 --- |
||||||
|
原始文本长度: 1184 字符 |
||||||
|
最终文本长度: 1184 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1149 详细报告 --- |
||||||
|
原始文本长度: 3964 字符 |
||||||
|
最终文本长度: 3964 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
|
||||||
|
--- ID 1150 详细报告 --- |
||||||
|
原始文本长度: 1263 字符 |
||||||
|
最终文本长度: 1191 字符 |
||||||
|
总体压缩比: 5.70% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 73 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 3 项内容 |
||||||
|
|
||||||
|
--- ID 1151 详细报告 --- |
||||||
|
原始文本长度: 1611 字符 |
||||||
|
最终文本长度: 1524 字符 |
||||||
|
总体压缩比: 5.40% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 88 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 8 项内容 |
||||||
|
|
||||||
|
--- ID 1152 详细报告 --- |
||||||
|
原始文本长度: 1810 字符 |
||||||
|
最终文本长度: 1046 字符 |
||||||
|
总体压缩比: 42.21% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 765 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
|
PARAGRAPHS级别移除了 16 项内容 |
||||||
|
|
||||||
|
--- ID 1153 详细报告 --- |
||||||
|
原始文本长度: 835 字符 |
||||||
|
最终文本长度: 835 字符 |
||||||
|
总体压缩比: 0.00% |
||||||
|
各级别处理效果: |
||||||
|
1. 段落级去重: 减少 1 字符 |
||||||
|
2. 句子级去重: 减少 -1 字符 |
||||||
|
3. 短语级去重: 减少 0 字符 |
||||||
|
4. 最终标点规范化: 减少 0 字符 |
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,219 @@ |
|||||||
|
import json |
||||||
|
import random |
||||||
|
from collections import defaultdict |
||||||
|
import pandas as pd |
||||||
|
|
||||||
|
|
||||||
|
def split_dataset_by_source_id(input_file, test_size=150, random_seed=42): |
||||||
|
""" |
||||||
|
根据source_id随机划分数据集为训练集和测试集 |
||||||
|
|
||||||
|
Args: |
||||||
|
input_file: 输入的JSON文件路径 |
||||||
|
test_size: 测试集中source_id的数量 |
||||||
|
random_seed: 随机种子,确保结果可重现 |
||||||
|
|
||||||
|
Returns: |
||||||
|
train_data, test_data, train_source_ids, test_source_ids: 训练集、测试集及各自的source_id列表 |
||||||
|
""" |
||||||
|
|
||||||
|
# 设置随机种子 |
||||||
|
random.seed(random_seed) |
||||||
|
|
||||||
|
print(f"正在读取文件: {input_file}") |
||||||
|
|
||||||
|
try: |
||||||
|
# 读取JSON文件 |
||||||
|
with open(input_file, 'r', encoding='utf-8') as f: |
||||||
|
all_data = json.load(f) |
||||||
|
|
||||||
|
print(f"✓ 成功读取文件,总记录数: {len(all_data)}") |
||||||
|
|
||||||
|
# 按source_id分组 |
||||||
|
source_id_groups = defaultdict(list) |
||||||
|
for item in all_data: |
||||||
|
source_id_groups[item['source_id']].append(item) |
||||||
|
|
||||||
|
# 获取所有unique的source_id |
||||||
|
all_source_ids = list(source_id_groups.keys()) |
||||||
|
total_source_ids = len(all_source_ids) |
||||||
|
|
||||||
|
print(f"✓ 发现 {total_source_ids} 个不同的source_id") |
||||||
|
|
||||||
|
# 检查测试集大小是否合理 |
||||||
|
if test_size >= total_source_ids: |
||||||
|
print(f"✗ 错误:测试集大小 ({test_size}) 大于等于总source_id数量 ({total_source_ids})") |
||||||
|
print(f"建议将测试集大小设置为小于 {total_source_ids}") |
||||||
|
return None, None, None, None |
||||||
|
|
||||||
|
# 随机选择测试集的source_id |
||||||
|
test_source_ids = random.sample(all_source_ids, test_size) |
||||||
|
train_source_ids = [sid for sid in all_source_ids if sid not in test_source_ids] |
||||||
|
|
||||||
|
print(f"✓ 随机选择了 {len(test_source_ids)} 个source_id作为测试集") |
||||||
|
print(f"✓ 剩余 {len(train_source_ids)} 个source_id作为训练集") |
||||||
|
|
||||||
|
# 构建训练集和测试集 |
||||||
|
train_data = [] |
||||||
|
test_data = [] |
||||||
|
|
||||||
|
for source_id in train_source_ids: |
||||||
|
train_data.extend(source_id_groups[source_id]) |
||||||
|
|
||||||
|
for source_id in test_source_ids: |
||||||
|
test_data.extend(source_id_groups[source_id]) |
||||||
|
|
||||||
|
print(f"\n=== 数据集划分结果 ===") |
||||||
|
print(f"训练集:") |
||||||
|
print(f" - Source ID数量: {len(train_source_ids)}") |
||||||
|
print(f" - 记录数量: {len(train_data)}") |
||||||
|
|
||||||
|
print(f"测试集:") |
||||||
|
print(f" - Source ID数量: {len(test_source_ids)}") |
||||||
|
print(f" - 记录数量: {len(test_data)}") |
||||||
|
|
||||||
|
# 统计标签分布 |
||||||
|
def get_label_distribution(data, dataset_name): |
||||||
|
label_counts = defaultdict(int) |
||||||
|
for item in data: |
||||||
|
label_counts[item['label']] += 1 |
||||||
|
|
||||||
|
print(f"\n{dataset_name}标签分布:") |
||||||
|
for label, count in sorted(label_counts.items()): |
||||||
|
percentage = (count / len(data) * 100) if len(data) > 0 else 0 |
||||||
|
print(f" 标签 {label}: {count} 条 ({percentage:.2f}%)") |
||||||
|
|
||||||
|
return label_counts |
||||||
|
|
||||||
|
train_labels = get_label_distribution(train_data, "训练集") |
||||||
|
test_labels = get_label_distribution(test_data, "测试集") |
||||||
|
|
||||||
|
# 显示选中的source_id |
||||||
|
print(f"\n=== 测试集Source ID列表 ===") |
||||||
|
print(f"测试集source_id: {sorted(test_source_ids)}") |
||||||
|
|
||||||
|
print(f"\n=== 训练集Source ID列表 ===") |
||||||
|
print(f"训练集source_id: {sorted(train_source_ids)}") |
||||||
|
|
||||||
|
return train_data, test_data, train_source_ids, test_source_ids |
||||||
|
|
||||||
|
except FileNotFoundError: |
||||||
|
print(f"✗ 错误:找不到文件 {input_file}") |
||||||
|
return None, None, None, None |
||||||
|
except json.JSONDecodeError as e: |
||||||
|
print(f"✗ 错误:JSON文件格式错误 - {str(e)}") |
||||||
|
return None, None, None, None |
||||||
|
except Exception as e: |
||||||
|
print(f"✗ 错误:处理文件时出现异常 - {str(e)}") |
||||||
|
return None, None, None, None |
||||||
|
|
||||||
|
|
||||||
|
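# For reference only (not called anywhere in this script): a minimal sketch of the
# same leakage-free, group-wise split using scikit-learn's GroupShuffleSplit,
# assuming scikit-learn is installed. test_size is an absolute number of
# source_id groups here, mirroring test_size=150 above.
def group_split_with_sklearn(all_data, test_groups=150, seed=42):
    from sklearn.model_selection import GroupShuffleSplit
    groups = [item['source_id'] for item in all_data]
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_groups, random_state=seed)
    train_idx, test_idx = next(splitter.split(all_data, groups=groups))
    return [all_data[i] for i in train_idx], [all_data[i] for i in test_idx]
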
def save_dataset(data, filename, description): |
||||||
|
""" |
||||||
|
保存数据集到JSON文件 |
||||||
|
|
||||||
|
Args: |
||||||
|
data: 要保存的数据 |
||||||
|
filename: 输出文件名 |
||||||
|
description: 数据集描述 |
||||||
|
""" |
||||||
|
try: |
||||||
|
with open(filename, 'w', encoding='utf-8') as f: |
||||||
|
json.dump(data, f, ensure_ascii=False, indent=2) |
||||||
|
print(f"✓ {description}已保存到: {filename}") |
||||||
|
return True |
||||||
|
except Exception as e: |
||||||
|
print(f"✗ 保存{description}时出错: {str(e)}") |
||||||
|
return False |
||||||
|
|
||||||
|
|
||||||
|
def create_summary_report(train_data, test_data, train_source_ids, test_source_ids): |
||||||
|
""" |
||||||
|
创建数据集划分的详细报告 |
||||||
|
""" |
||||||
|
summary = { |
||||||
|
"split_info": { |
||||||
|
"total_source_ids": len(train_source_ids) + len(test_source_ids), |
||||||
|
"train_source_ids": len(train_source_ids), |
||||||
|
"test_source_ids": len(test_source_ids), |
||||||
|
"total_records": len(train_data) + len(test_data), |
||||||
|
"train_records": len(train_data), |
||||||
|
"test_records": len(test_data) |
||||||
|
}, |
||||||
|
"train_source_id_list": sorted(train_source_ids), |
||||||
|
"test_source_id_list": sorted(test_source_ids), |
||||||
|
"label_distribution": { |
||||||
|
"train": {}, |
||||||
|
"test": {} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
# 计算标签分布 |
||||||
|
for dataset_name, data in [("train", train_data), ("test", test_data)]: |
||||||
|
label_counts = defaultdict(int) |
||||||
|
for item in data: |
||||||
|
label_counts[item['label']] += 1 |
||||||
|
|
||||||
|
summary["label_distribution"][dataset_name] = dict(label_counts) |
||||||
|
|
||||||
|
# 保存报告 |
||||||
|
with open('dataset_split_summary.json', 'w', encoding='utf-8') as f: |
||||||
|
json.dump(summary, f, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
print(f"✓ 数据集划分报告已保存到: dataset_split_summary.json") |
||||||
|
|
||||||
|
|
||||||
|
# 主程序执行 |
||||||
|
if __name__ == "__main__": |
||||||
|
print("=" * 60) |
||||||
|
print("数据集划分程序") |
||||||
|
print("=" * 60) |
||||||
|
|
||||||
|
# 输入文件名 |
||||||
|
input_file = "segmentation_results_from_7_retried.json" |
||||||
|
|
||||||
|
# 执行数据集划分 |
||||||
|
train_data, test_data, train_source_ids, test_source_ids = split_dataset_by_source_id( |
||||||
|
input_file=input_file, |
||||||
|
test_size=150, |
||||||
|
random_seed=42 |
||||||
|
) |
||||||
|
|
||||||
|
if train_data is not None and test_data is not None: |
||||||
|
print(f"\n{'=' * 60}") |
||||||
|
print("开始保存数据集文件") |
||||||
|
print(f"{'=' * 60}") |
||||||
|
|
||||||
|
# 保存训练集 |
||||||
|
train_success = save_dataset(train_data, "train_dataset.json", "训练集") |
||||||
|
|
||||||
|
# 保存测试集 |
||||||
|
test_success = save_dataset(test_data, "test_dataset.json", "测试集") |
||||||
|
|
||||||
|
if train_success and test_success: |
||||||
|
# 创建详细报告 |
||||||
|
create_summary_report(train_data, test_data, train_source_ids, test_source_ids) |
||||||
|
|
||||||
|
print(f"\n{'=' * 60}") |
||||||
|
print("数据集划分完成!") |
||||||
|
print(f"{'=' * 60}") |
||||||
|
print("生成的文件:") |
||||||
|
print("1. train_dataset.json - 训练集数据") |
||||||
|
print("2. test_dataset.json - 测试集数据") |
||||||
|
print("3. dataset_split_summary.json - 划分报告") |
||||||
|
|
||||||
|
# 验证数据完整性 |
||||||
|
print(f"\n=== 数据完整性验证 ===") |
||||||
|
with open(input_file, 'r', encoding='utf-8') as f:  # 重新读取原始文件,以其记录数为基准进行校验,避免与自身比较 |
    original_count = len(json.load(f)) |
||||||
|
print(f"原始数据总数: {original_count}") |
||||||
|
print(f"训练集 + 测试集: {len(train_data)} + {len(test_data)} = {len(train_data) + len(test_data)}") |
||||||
|
|
||||||
|
if len(train_data) + len(test_data) == original_count: |
||||||
|
print("✓ 数据完整性验证通过") |
||||||
|
else: |
||||||
|
print("✗ 数据完整性验证失败") |
||||||
|
|
||||||
|
else: |
||||||
|
print("✗ 保存文件时出现错误") |
||||||
|
else: |
||||||
|
print("✗ 数据集划分失败") |
||||||
File diff suppressed because it is too large
@ -0,0 +1,344 @@ |
|||||||
|
import json |
||||||
|
import matplotlib.pyplot as plt |
||||||
|
import matplotlib.patches as patches |
||||||
|
from collections import Counter |
||||||
|
import numpy as np |
||||||
|
import warnings |
||||||
|
import re |
||||||
|
|
||||||
|
# 忽略matplotlib警告 |
||||||
|
warnings.filterwarnings('ignore') |
||||||
|
|
||||||
|
# 设置matplotlib后端(避免显示问题) |
||||||
|
import matplotlib |
||||||
|
|
||||||
|
matplotlib.use('Agg') # 使用非交互式后端 |
||||||
|
|
||||||
|
# 设置中文字体支持 |
||||||
|
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans'] |
||||||
|
plt.rcParams['axes.unicode_minus'] = False |
||||||
|
|
||||||
|
|
||||||
|
def diagnose_json_file(file_path): |
||||||
|
""" |
||||||
|
诊断JSON文件的问题 |
||||||
|
|
||||||
|
Args: |
||||||
|
file_path (str): JSON文件路径 |
||||||
|
|
||||||
|
Returns: |
||||||
|
dict: 诊断结果 |
||||||
|
""" |
||||||
|
print(f"正在诊断文件:{file_path}") |
||||||
|
print("=" * 50) |
||||||
|
|
||||||
|
try: |
||||||
|
with open(file_path, 'r', encoding='utf-8') as f: |
||||||
|
content = f.read() |
||||||
|
|
||||||
|
print(f"文件大小:{len(content)} 字符") |
||||||
|
print(f"文件前100个字符:{content[:100]}") |
||||||
|
print(f"文件后100个字符:{content[-100:]}") |
||||||
|
|
||||||
|
# 检查是否为空文件 |
||||||
|
if not content.strip(): |
||||||
|
print("错误:文件为空") |
||||||
|
return {"status": "empty", "content": content} |
||||||
|
|
||||||
|
# 尝试解析JSON |
||||||
|
try: |
||||||
|
data = json.loads(content) |
||||||
|
print("✓ JSON格式正确") |
||||||
|
return {"status": "valid", "data": data} |
||||||
|
except json.JSONDecodeError as e: |
||||||
|
print(f"✗ JSON格式错误:{e}") |
||||||
|
print(f"错误位置:行 {e.lineno}, 列 {e.colno}") |
||||||
|
return {"status": "invalid", "error": str(e), "content": content} |
||||||
|
|
||||||
|
except FileNotFoundError: |
||||||
|
print(f"错误:找不到文件 {file_path}") |
||||||
|
return {"status": "not_found"} |
||||||
|
except Exception as e: |
||||||
|
print(f"读取文件时出错:{e}") |
||||||
|
return {"status": "error", "error": str(e)} |
||||||
|
|
||||||
|
|
||||||
|
def try_fix_json(content): |
||||||
|
""" |
||||||
|
尝试修复常见的JSON格式问题 |
||||||
|
|
||||||
|
Args: |
||||||
|
content (str): 原始文件内容 |
||||||
|
|
||||||
|
Returns: |
||||||
|
list: 修复后的数据,如果修复失败则返回None |
||||||
|
""" |
||||||
|
print("\n尝试修复JSON格式...") |
||||||
|
|
||||||
|
# 常见修复方法 |
||||||
|
fixes = [ |
||||||
|
# 1. 如果是JSONL格式(每行一个JSON对象) |
||||||
|
lambda x: [json.loads(line) for line in x.strip().split('\n') if line.strip()], |
||||||
|
|
||||||
|
# 2. 如果缺少最外层的方括号 |
||||||
|
lambda x: json.loads('[' + x + ']'), |
||||||
|
|
||||||
|
# 3. 如果有多个JSON对象但没有用逗号分隔 |
||||||
|
lambda x: json.loads('[' + re.sub(r'}\s*{', '},{', x) + ']'), |
||||||
|
|
||||||
|
# 4. 如果有trailing comma |
||||||
|
lambda x: json.loads(re.sub(r',\s*}', '}', re.sub(r',\s*]', ']', x))), |
||||||
|
|
||||||
|
# 5. 如果单引号而非双引号 |
||||||
|
lambda x: json.loads(x.replace("'", '"')), |
||||||
|
] |
||||||
|
|
||||||
|
for i, fix_func in enumerate(fixes, 1): |
||||||
|
try: |
||||||
|
print(f"尝试修复方法 {i}...") |
||||||
|
result = fix_func(content) |
||||||
|
if isinstance(result, list) and len(result) > 0: |
||||||
|
print(f"✓ 修复成功!找到 {len(result)} 条数据") |
||||||
|
return result |
||||||
|
elif isinstance(result, dict): |
||||||
|
print(f"✓ 修复成功!找到 1 条数据") |
||||||
|
return [result] |
||||||
|
except Exception as e: |
||||||
|
print(f"✗ 修复方法 {i} 失败:{e}") |
||||||
|
|
||||||
|
print("所有修复方法都失败了") |
||||||
|
return None |
||||||
|
|
||||||
|
|
||||||
|
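# Illustrative sketch (not called anywhere): repair method 1 above covers JSONL
# input, i.e. one JSON object per line, as in this hypothetical string.
def _demo_try_fix_json():
    jsonl_text = '{"label": 0, "sentence1": "a", "sentence2": "b"}\n{"label": 1, "sentence1": "b", "sentence2": "c"}'
    return try_fix_json(jsonl_text)  # expected: a list of 2 dicts
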
def load_and_analyze_json(file_path): |
||||||
|
""" |
||||||
|
加载JSON文件并统计标签分布,包含错误处理和修复功能 |
||||||
|
|
||||||
|
Args: |
||||||
|
file_path (str): JSON文件路径 |
||||||
|
|
||||||
|
Returns: |
||||||
|
tuple: (标签统计结果, 总数) |
||||||
|
""" |
||||||
|
# 首先诊断文件 |
||||||
|
diagnosis = diagnose_json_file(file_path) |
||||||
|
|
||||||
|
if diagnosis["status"] == "not_found": |
||||||
|
return None, None |
||||||
|
elif diagnosis["status"] == "empty": |
||||||
|
print("文件为空,无法分析") |
||||||
|
return None, None |
||||||
|
elif diagnosis["status"] == "valid": |
||||||
|
data = diagnosis["data"] |
||||||
|
elif diagnosis["status"] == "invalid": |
||||||
|
# 尝试修复 |
||||||
|
fixed_data = try_fix_json(diagnosis["content"]) |
||||||
|
if fixed_data is None: |
||||||
|
print("无法修复JSON格式错误") |
||||||
|
return None, None |
||||||
|
data = fixed_data |
||||||
|
else: |
||||||
|
print(f"未知错误:{diagnosis.get('error', '未知')}") |
||||||
|
return None, None |
||||||
|
|
||||||
|
# 确保数据是列表格式 |
||||||
|
if not isinstance(data, list): |
||||||
|
data = [data] |
||||||
|
|
||||||
|
print(f"\n成功加载数据,共 {len(data)} 条记录") |
||||||
|
|
||||||
|
# 检查数据结构 |
||||||
|
if len(data) == 0: |
||||||
|
print("数据为空") |
||||||
|
return None, None |
||||||
|
|
||||||
|
# 检查第一条数据的结构 |
||||||
|
first_item = data[0] |
||||||
|
print(f"第一条数据结构:{list(first_item.keys()) if isinstance(first_item, dict) else type(first_item)}") |
||||||
|
|
||||||
|
# 提取标签 |
||||||
|
labels = [] |
||||||
|
for i, item in enumerate(data): |
||||||
|
if isinstance(item, dict): |
||||||
|
if 'label' in item: |
||||||
|
labels.append(item['label']) |
||||||
|
elif 'Label' in item: |
||||||
|
labels.append(item['Label']) |
||||||
|
else: |
||||||
|
print(f"警告:第 {i + 1} 条数据缺少 'label' 字段:{item}") |
||||||
|
else: |
||||||
|
print(f"警告:第 {i + 1} 条数据不是字典格式:{item}") |
||||||
|
|
||||||
|
if not labels: |
||||||
|
print("错误:没有找到任何标签数据") |
||||||
|
return None, None |
||||||
|
|
||||||
|
# 统计标签数量 |
||||||
|
label_counts = Counter(labels) |
||||||
|
total = len(labels) |
||||||
|
|
||||||
|
# 打印统计结果 |
||||||
|
print("=" * 50) |
||||||
|
print("标签统计结果:") |
||||||
|
print("=" * 50) |
||||||
|
print(f"总数据条数:{total}") |
||||||
|
print("-" * 30) |
||||||
|
|
||||||
|
for label, count in sorted(label_counts.items()): |
||||||
|
percentage = (count / total) * 100 |
||||||
|
print(f"标签 {label}: {count:4d} 条 ({percentage:5.1f}%)") |
||||||
|
|
||||||
|
return label_counts, total |
||||||
|
|
||||||
|
|
||||||
|
def create_pie_chart(label_counts, total, save_path=None): |
||||||
|
""" |
||||||
|
创建扇形图 |
||||||
|
|
||||||
|
Args: |
||||||
|
label_counts (dict): 标签统计结果 |
||||||
|
total (int): 总数据条数 |
||||||
|
save_path (str, optional): 保存图片的路径 |
||||||
|
""" |
||||||
|
# 准备数据 |
||||||
|
labels = [] |
||||||
|
sizes = [] |
||||||
|
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD'] |
||||||
|
|
||||||
|
for label, count in sorted(label_counts.items()): |
||||||
|
if label == 0: |
||||||
|
labels.append(f'不分段 (Label {label})') |
||||||
|
else: |
||||||
|
labels.append(f'分段 (Label {label})') |
||||||
|
sizes.append(count) |
||||||
|
|
||||||
|
# 创建图形 |
||||||
|
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7)) |
||||||
|
|
||||||
|
# 扇形图 |
||||||
|
wedges, texts, autotexts = ax1.pie(sizes, labels=labels, autopct='%1.1f%%', |
||||||
|
colors=colors[:len(sizes)], startangle=90, |
||||||
|
explode=[0.05] * len(sizes)) |
||||||
|
|
||||||
|
# 美化扇形图 |
||||||
|
ax1.set_title('文本分段标签分布统计', fontsize=16, fontweight='bold', pad=20) |
||||||
|
|
||||||
|
# 调整文本样式 |
||||||
|
for autotext in autotexts: |
||||||
|
autotext.set_color('white') |
||||||
|
autotext.set_fontweight('bold') |
||||||
|
autotext.set_fontsize(12) |
||||||
|
|
||||||
|
for text in texts: |
||||||
|
text.set_fontsize(11) |
||||||
|
|
||||||
|
# 柱状图 |
||||||
|
ax2.bar(range(len(label_counts)), sizes, color=colors[:len(sizes)], alpha=0.7) |
||||||
|
ax2.set_title('标签数量柱状图', fontsize=16, fontweight='bold', pad=20) |
||||||
|
ax2.set_xlabel('标签类型', fontsize=12) |
||||||
|
ax2.set_ylabel('数量', fontsize=12) |
||||||
|
|
||||||
|
# 设置x轴标签 |
||||||
|
ax2.set_xticks(range(len(label_counts))) |
||||||
|
ax2.set_xticklabels([f'Label {label}' for label in sorted(label_counts.keys())]) |
||||||
|
|
||||||
|
# 在柱状图上添加数值标签 |
||||||
|
for i, (label, count) in enumerate(sorted(label_counts.items())): |
||||||
|
percentage = (count / total) * 100 |
||||||
|
ax2.text(i, count + total * 0.01, f'{count}\n({percentage:.1f}%)', |
||||||
|
ha='center', va='bottom', fontweight='bold') |
||||||
|
|
||||||
|
# 调整布局 |
||||||
|
plt.tight_layout() |
||||||
|
|
||||||
|
# 保存图片 |
||||||
|
if save_path: |
||||||
|
try: |
||||||
|
plt.savefig(save_path, dpi=300, bbox_inches='tight') |
||||||
|
print(f"\n图片已保存到:{save_path}") |
||||||
|
except Exception as e: |
||||||
|
print(f"保存图片时出错:{e}") |
||||||
|
|
||||||
|
print("图表生成完成,请查看保存的图片文件。") |
||||||
|
|
||||||
|
# 关闭图形以释放内存 |
||||||
|
plt.close(fig) |
||||||
|
|
||||||
|
|
||||||
|
def create_detailed_report(label_counts, total, file_path): |
||||||
|
""" |
||||||
|
创建详细报告 |
||||||
|
|
||||||
|
Args: |
||||||
|
label_counts (dict): 标签统计结果 |
||||||
|
total (int): 总数据条数 |
||||||
|
file_path (str): 原始JSON文件路径 |
||||||
|
""" |
||||||
|
report = [] |
||||||
|
report.append("=" * 60) |
||||||
|
report.append("文本分段标签分布统计报告") |
||||||
|
report.append("=" * 60) |
||||||
|
report.append(f"数据源文件:{file_path}") |
||||||
|
report.append(f"分析时间:{np.datetime64('now', 'D')}") |
||||||
|
report.append(f"总数据条数:{total}") |
||||||
|
report.append("") |
||||||
|
|
||||||
|
report.append("标签分布详情:") |
||||||
|
report.append("-" * 40) |
||||||
|
|
||||||
|
for label, count in sorted(label_counts.items()): |
||||||
|
percentage = (count / total) * 100 |
||||||
|
label_desc = "不分段" if label == 0 else "分段" |
||||||
|
report.append(f"Label {label} ({label_desc}):{count:4d} 条 ({percentage:5.1f}%)") |
||||||
|
|
||||||
|
report.append("") |
||||||
|
report.append("标签含义说明:") |
||||||
|
report.append("- Label 0:两句话不需要分段,属于同一段落") |
||||||
|
report.append("- Label 1:两句话需要分段,属于不同段落") |
||||||
|
|
||||||
|
# 打印报告 |
||||||
|
for line in report: |
||||||
|
print(line) |
||||||
|
|
||||||
|
# 保存报告到文件 |
||||||
|
report_file = file_path.replace('.json', '_analysis_report.txt') |
||||||
|
try: |
||||||
|
with open(report_file, 'w', encoding='utf-8') as f: |
||||||
|
f.write('\n'.join(report)) |
||||||
|
print(f"\n详细报告已保存到:{report_file}") |
||||||
|
except Exception as e: |
||||||
|
print(f"保存报告时出错:{e}") |
||||||
|
|
||||||
|
|
||||||
|
def main(): |
||||||
|
"""主函数""" |
||||||
|
# JSON文件路径 |
||||||
|
json_file = 'test_dataset.json' |
||||||
|
|
||||||
|
print("JSON文件分析工具 - 增强版") |
||||||
|
print("=" * 50) |
||||||
|
|
||||||
|
# 加载并分析数据 |
||||||
|
label_counts, total = load_and_analyze_json(json_file) |
||||||
|
|
||||||
|
if label_counts is not None: |
||||||
|
# 创建扇形图 |
||||||
|
image_path = json_file.replace('.json', '_pie_chart.png') |
||||||
|
create_pie_chart(label_counts, total, image_path) |
||||||
|
|
||||||
|
# 创建详细报告 |
||||||
|
create_detailed_report(label_counts, total, json_file) |
||||||
|
|
||||||
|
print("\n分析完成!") |
||||||
|
print("=" * 50) |
||||||
|
else: |
||||||
|
print("分析失败,请检查文件内容和格式。") |
||||||
|
print("\n建议:") |
||||||
|
print("1. 确保文件存在且不为空") |
||||||
|
print("2. 检查JSON格式是否正确") |
||||||
|
print("3. 确保每条数据都有'label'字段") |
||||||
|
print("4. 如果是JSONL格式,确保每行都是有效的JSON对象") |
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": |
||||||
|
main() |
||||||
@ -0,0 +1,293 @@ |
|||||||
|
#!/usr/bin/env python3 |
||||||
|
# -*- coding: utf-8 -*- |
||||||
|
""" |
||||||
|
BERT Token数量统计与可视化 |
||||||
|
统计sentence1和最后一个sentence2的token数量分布 |
||||||
|
""" |
||||||
|
|
||||||
|
import json |
||||||
|
import matplotlib.pyplot as plt |
||||||
|
import numpy as np |
||||||
|
import pandas as pd |
||||||
|
from collections import Counter |
||||||
|
from transformers import AutoTokenizer |
||||||
|
import warnings |
||||||
|
|
||||||
|
# 忽略transformers的警告 |
||||||
|
warnings.filterwarnings("ignore") |
||||||
|
|
||||||
|
# 设置matplotlib后端,避免显示问题 |
||||||
|
plt.switch_backend('Agg') |
||||||
|
|
||||||
|
|
||||||
|
def load_sentence_pairs(file_path): |
||||||
|
"""加载句子对数据""" |
||||||
|
try: |
||||||
|
with open(file_path, 'r', encoding='utf-8') as f: |
||||||
|
data = json.load(f) |
||||||
|
print(f"成功加载 {len(data)} 个句子对") |
||||||
|
return data |
||||||
|
except FileNotFoundError: |
||||||
|
print(f"错误:找不到文件 {file_path}") |
||||||
|
return None |
||||||
|
except json.JSONDecodeError: |
||||||
|
print(f"错误:JSON文件格式错误") |
||||||
|
return None |
||||||
|
except Exception as e: |
||||||
|
print(f"加载文件时发生错误:{e}") |
||||||
|
return None |
||||||
|
|
||||||
|
|
||||||
|
def initialize_tokenizer(model_name="bert-base-chinese"): |
||||||
|
"""初始化BERT tokenizer""" |
||||||
|
try: |
||||||
|
print(f"初始化 {model_name} tokenizer...") |
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
||||||
|
print("Tokenizer初始化成功") |
||||||
|
return tokenizer |
||||||
|
except Exception as e: |
||||||
|
print(f"初始化tokenizer失败:{e}") |
||||||
|
print("尝试使用备用tokenizer...") |
||||||
|
try: |
||||||
|
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased") |
||||||
|
print("成功使用多语言BERT tokenizer") |
||||||
|
return tokenizer |
||||||
|
except Exception as e2: |
||||||
|
print(f"备用tokenizer也失败:{e2}") |
||||||
|
return None |
||||||
|
|
||||||
|
|
||||||
|
def count_bert_tokens(text, tokenizer): |
||||||
|
"""计算文本的BERT token数量(不包含特殊token)""" |
||||||
|
if not text or text.strip() == "": |
||||||
|
return 0 |
||||||
|
|
||||||
|
try: |
||||||
|
# 使用tokenizer编码文本,不添加特殊token |
||||||
|
tokens = tokenizer.encode(text, add_special_tokens=False) |
||||||
|
return len(tokens) |
||||||
|
except Exception as e: |
||||||
|
print(f"计算token时出错:{e}") |
||||||
|
return 0 |
||||||
|
|
||||||
|
|
||||||
|
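# Illustrative sketch (not called anywhere): with add_special_tokens=False,
# bert-base-chinese typically yields roughly one WordPiece token per Chinese
# character. Assumes the pretrained model can be loaded as in initialize_tokenizer.
def _demo_count_bert_tokens():
    tok = AutoTokenizer.from_pretrained("bert-base-chinese")
    return count_bert_tokens("今天天气不错", tok)  # typically 6 for these 6 characters
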
def get_token_range_label(token_count): |
||||||
|
"""根据token数量获取对应的区间标签""" |
||||||
|
range_start = (token_count // 100) * 100 |
||||||
|
range_end = range_start + 99 |
||||||
|
return f"{range_start}-{range_end}" |
||||||
|
|
||||||
|
|
||||||
|
def analyze_token_distribution(sentence_pairs, tokenizer): |
||||||
|
"""分析token分布""" |
||||||
|
print("\n开始分析token分布...") |
||||||
|
|
||||||
|
# 收集所有sentence1的token数量和对应的source_id |
||||||
|
sentence1_tokens = [] |
||||||
|
token_details = [] # 存储详细信息:(token_count, source_id, sentence_type, sentence_text) |
||||||
|
|
||||||
|
for pair in sentence_pairs: |
||||||
|
sentence1 = pair.get('sentence1', '') |
||||||
|
source_id = pair.get('source_id', 'unknown') |
||||||
|
token_count = count_bert_tokens(sentence1, tokenizer) |
||||||
|
sentence1_tokens.append(token_count) |
||||||
|
token_details.append((token_count, source_id, 'sentence1', sentence1)) |
||||||
|
|
||||||
|
# 获取最后一个句子对的sentence2 |
||||||
|
last_sentence2_tokens = 0 |
||||||
|
if sentence_pairs: |
||||||
|
last_pair = sentence_pairs[-1] |
||||||
|
last_sentence2 = last_pair.get('sentence2', '') |
||||||
|
last_source_id = last_pair.get('source_id', 'unknown') |
||||||
|
last_sentence2_tokens = count_bert_tokens(last_sentence2, tokenizer) |
||||||
|
if last_sentence2_tokens > 0: |
||||||
|
token_details.append((last_sentence2_tokens, last_source_id, 'sentence2', last_sentence2)) |
||||||
|
|
||||||
|
print(f"处理了 {len(sentence1_tokens)} 个sentence1") |
||||||
|
print(f"最后一个sentence2的token数量: {last_sentence2_tokens}") |
||||||
|
|
||||||
|
return sentence1_tokens, last_sentence2_tokens, token_details |
||||||
|
|
||||||
|
|
||||||
|
def create_token_distribution_chart(sentence1_tokens, last_sentence2_tokens): |
||||||
|
"""创建token分布柱状图""" |
||||||
|
print("\n生成token分布图...") |
||||||
|
|
||||||
|
# 合并所有需要统计的token数量 |
||||||
|
all_tokens = sentence1_tokens + [last_sentence2_tokens] if last_sentence2_tokens > 0 else sentence1_tokens |
||||||
|
|
||||||
|
# 计算最大token数量以确定区间范围 |
||||||
|
max_tokens = max(all_tokens) if all_tokens else 0 |
||||||
|
max_range = ((max_tokens // 100) + 1) * 100 |
||||||
|
|
||||||
|
# 创建区间 |
||||||
|
ranges = [] |
||||||
|
range_labels = [] |
||||||
|
for i in range(0, max_range, 100): |
||||||
|
ranges.append((i, i + 99)) |
||||||
|
range_labels.append(f"{i}-{i + 99}") |
||||||
|
|
||||||
|
# 统计每个区间的句子数量 |
||||||
|
range_counts = [0] * len(ranges) |
||||||
|
|
||||||
|
for token_count in all_tokens: |
||||||
|
range_index = token_count // 100 |
||||||
|
if range_index < len(range_counts): |
||||||
|
range_counts[range_index] += 1 |
||||||
|
|
||||||
|
# 创建图表 |
||||||
|
plt.figure(figsize=(12, 8)) |
||||||
|
|
||||||
|
# 创建柱状图 |
||||||
|
bars = plt.bar(range_labels, range_counts, color='skyblue', edgecolor='navy', alpha=0.7) |
||||||
|
|
||||||
|
# 设置图表属性 |
||||||
|
plt.title('BERT Token Count Distribution', fontsize=16, fontweight='bold') |
||||||
|
plt.xlabel('Token Count Range', fontsize=12) |
||||||
|
plt.ylabel('Number of Sentences', fontsize=12) |
||||||
|
plt.xticks(rotation=45, ha='right') |
||||||
|
plt.grid(axis='y', alpha=0.3) |
||||||
|
|
||||||
|
# 在柱子上添加数值标签 |
||||||
|
for bar, count in zip(bars, range_counts): |
||||||
|
if count > 0: # 只在有数据的柱子上显示标签 |
||||||
|
plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5, |
||||||
|
str(count), ha='center', va='bottom', fontsize=10) |
||||||
|
|
||||||
|
# 调整布局 |
||||||
|
plt.tight_layout() |
||||||
|
|
||||||
|
# 显示统计信息 |
||||||
|
total_sentences = len(all_tokens) |
||||||
|
avg_tokens = np.mean(all_tokens) if all_tokens else 0 |
||||||
|
median_tokens = np.median(all_tokens) if all_tokens else 0 |
||||||
|
|
||||||
|
# 在图表上添加统计信息文本框 |
||||||
|
stats_text = f'Total Sentences: {total_sentences}\n' |
||||||
|
stats_text += f'Average Tokens: {avg_tokens:.1f}\n' |
||||||
|
stats_text += f'Median Tokens: {median_tokens:.1f}\n' |
||||||
|
stats_text += f'Max Tokens: {max_tokens}' |
||||||
|
|
||||||
|
plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, |
||||||
|
bbox=dict(boxstyle='round', facecolor='white', alpha=0.8), |
||||||
|
verticalalignment='top', fontsize=10) |
||||||
|
|
||||||
|
return plt |
||||||
|
|
||||||
|
|
||||||
|
def find_high_token_sentences(token_details, threshold=300): |
||||||
|
"""查找token数量超过阈值的句子""" |
||||||
|
print(f"\n=== Token数量超过{threshold}的句子 ===") |
||||||
|
|
||||||
|
high_token_sentences = [(count, source_id, sentence_type, sentence) |
||||||
|
for count, source_id, sentence_type, sentence in token_details |
||||||
|
if count > threshold] |
||||||
|
|
||||||
|
if not high_token_sentences: |
||||||
|
print(f"没有找到token数量超过{threshold}的句子") |
||||||
|
return [] |
||||||
|
|
||||||
|
# 按token数量降序排列 |
||||||
|
high_token_sentences.sort(key=lambda x: x[0], reverse=True) |
||||||
|
|
||||||
|
print(f"找到 {len(high_token_sentences)} 个token数量超过{threshold}的句子:") |
||||||
|
print("-" * 80) |
||||||
|
|
||||||
|
for i, (token_count, source_id, sentence_type, sentence) in enumerate(high_token_sentences, 1): |
||||||
|
print(f"{i}. Source ID: {source_id}") |
||||||
|
print(f" Type: {sentence_type}") |
||||||
|
print(f" Token Count: {token_count}") |
||||||
|
print(f" Content: {sentence[:100]}{'...' if len(sentence) > 100 else ''}") |
||||||
|
print("-" * 80) |
||||||
|
|
||||||
|
# 保存到CSV文件 |
||||||
|
||||||
|
df_high_tokens = pd.DataFrame(high_token_sentences, |
||||||
|
columns=['token_count', 'source_id', 'sentence_type', 'sentence_text']) |
||||||
|
output_file = f'high_token_sentences_over_{threshold}.csv' |
||||||
|
df_high_tokens.to_csv(output_file, index=False, encoding='utf-8-sig') |
||||||
|
print(f"详细信息已保存到: {output_file}") |
||||||
|
|
||||||
|
return high_token_sentences |
||||||
|
"""打印详细统计信息""" |
||||||
|
print("\n=== 详细统计信息 ===") |
||||||
|
|
||||||
|
all_tokens = sentence1_tokens + [last_sentence2_tokens] if last_sentence2_tokens > 0 else sentence1_tokens |
||||||
|
|
||||||
|
if not all_tokens: |
||||||
|
print("没有数据可统计") |
||||||
|
return |
||||||
|
|
||||||
|
print(f"Sentence1总数: {len(sentence1_tokens)}") |
||||||
|
print(f"Last Sentence2: {'已包含' if last_sentence2_tokens > 0 else '无数据'}") |
||||||
|
print(f"总句子数: {len(all_tokens)}") |
||||||
|
print(f"平均token数: {np.mean(all_tokens):.2f}") |
||||||
|
print(f"中位数token数: {np.median(all_tokens):.2f}") |
||||||
|
print(f"最小token数: {min(all_tokens)}") |
||||||
|
print(f"最大token数: {max(all_tokens)}") |
||||||
|
print(f"标准差: {np.std(all_tokens):.2f}") |
||||||
|
|
||||||
|
# 按区间统计 |
||||||
|
print("\n=== 区间分布 ===") |
||||||
|
max_tokens = max(all_tokens) |
||||||
|
max_range = ((max_tokens // 100) + 1) * 100 |
||||||
|
|
||||||
|
for i in range(0, max_range, 100): |
||||||
|
count = sum(1 for x in all_tokens if i <= x < i + 100) |
||||||
|
if count > 0: |
||||||
|
percentage = (count / len(all_tokens)) * 100 |
||||||
|
print(f"{i}-{i + 99} tokens: {count} 句子 ({percentage:.1f}%)") |
||||||
|
|
||||||
|
|
||||||
|
def main(): |
||||||
|
"""主函数""" |
||||||
|
# 文件路径 |
||||||
|
input_file = 'segmentation_results_from_7_retried.json' |
||||||
|
|
||||||
|
# 1. 加载数据 |
||||||
|
sentence_pairs = load_sentence_pairs(input_file) |
||||||
|
if sentence_pairs is None: |
||||||
|
return |
||||||
|
|
||||||
|
# 2. 初始化tokenizer |
||||||
|
tokenizer = initialize_tokenizer("bert-base-chinese") |
||||||
|
if tokenizer is None: |
||||||
|
print("无法初始化tokenizer,程序退出") |
||||||
|
return |
||||||
|
|
||||||
|
# 3. 分析token分布 |
||||||
|
sentence1_tokens, last_sentence2_tokens, token_details = analyze_token_distribution(sentence_pairs, tokenizer) |
||||||
|
|
||||||
|
if not sentence1_tokens: |
||||||
|
print("没有找到有效的句子数据") |
||||||
|
return |
||||||
|
|
||||||
|
# 4. 查找高token数量的句子 |
||||||
|
high_token_sentences = find_high_token_sentences(token_details, threshold=300) |
||||||
|
|
||||||
|
# 5. 打印详细统计 |
||||||
|
# print_detailed_statistics(sentence1_tokens, last_sentence2_tokens) |
||||||
|
|
||||||
|
# 6. 创建可视化图表 |
||||||
|
plt = create_token_distribution_chart(sentence1_tokens, last_sentence2_tokens) |
||||||
|
|
||||||
|
# 7. 保存和显示图表 |
||||||
|
try: |
||||||
|
output_file = 'bert_token_distribution.png' |
||||||
|
plt.savefig(output_file, dpi=300, bbox_inches='tight') |
||||||
|
print(f"\n图表已保存为: {output_file}") |
||||||
|
plt.show() |
||||||
|
except Exception as e: |
||||||
|
print(f"保存或显示图表时出错: {e}") |
||||||
|
# 尝试不显示图表,只保存 |
||||||
|
try: |
||||||
|
plt.savefig('bert_token_distribution.png', dpi=300, bbox_inches='tight') |
||||||
|
print("图表已保存,但无法显示") |
||||||
|
except Exception as e2: |
||||||
|
print(f"保存图表也失败: {e2}") |
||||||
|
|
||||||
|
print("\n分析完成!") |
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": |
||||||
|
main() |
||||||
File diff suppressed because it is too large
[Image: 189 KiB]
File diff suppressed because it is too large
[Image: 211 KiB]
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,67 @@ |
|||||||
|
import json |
||||||
|
import requests |
||||||
|
import os |
||||||
|
|
||||||
|
# 配置文件路径 |
||||||
|
test_file_path = r"D:\workstation\chinese-roberta-wwm-ext\model-train-eval-NN\AI标注\test.json" |
||||||
|
output_dir = r"D:\workstation\chinese-roberta-wwm-ext\model-train-eval-NN\AI标注\test01" |
||||||
|
|
||||||
|
# 确保输出目录存在 |
||||||
|
os.makedirs(output_dir, exist_ok=True) |
||||||
|
|
||||||
|
# 读取测试数据 |
||||||
|
with open(test_file_path, 'r', encoding='utf-8') as f: |
||||||
|
test_data = json.load(f) |
||||||
|
|
||||||
|
print(f"📁 加载测试数据: {len(test_data)} 条记录") |
||||||
|
|
||||||
|
# 服务地址 |
||||||
|
url = "http://localhost:8888/segment_batch_simple" |
||||||
|
|
||||||
|
# 准备请求数据 - 直接使用原始格式 |
||||||
|
broadcasts = test_data |
||||||
|
|
||||||
|
print(f"🚀 开始调用双路径边界分类器批量分段接口...") |
||||||
|
|
||||||
|
# 发送请求 |
||||||
|
try: |
||||||
|
response = requests.post(url, json=broadcasts) |
||||||
|
|
||||||
|
if response.status_code == 200: |
||||||
|
result = response.json() |
||||||
|
print("✅ 批量分段成功!") |
||||||
|
print(f"模型: {result['model']}") |
||||||
|
print(f"总计: {result['total']}") |
||||||
|
print(f"成功: {result['success']}") |
||||||
|
print(f"失败: {result['failed']}") |
||||||
|
print(f"处理时间: {result['processing_time']}秒") |
||||||
|
|
||||||
|
print("\n📝 分段结果:") |
||||||
|
print("=" * 80) |
||||||
|
|
||||||
|
for broadcast_id, segments in result['results'].items(): |
||||||
|
print(f"\n📻 {broadcast_id}:") |
||||||
|
|
||||||
|
if 'error' in segments: |
||||||
|
print(f"❌ 错误: {segments['error']}") |
||||||
|
else: |
||||||
|
for para_key, para_content in segments.items(): |
||||||
|
print(f" {para_key}: {para_content}") |
||||||
|
|
||||||
|
# 保存结果到文件 |
||||||
|
output_file = os.path.join(output_dir, "batch_segment_results.json") |
||||||
|
with open(output_file, 'w', encoding='utf-8') as f: |
||||||
|
json.dump(result, f, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
print(f"\n💾 结果已保存到: {output_file}") |
||||||
|
|
||||||
|
else: |
||||||
|
print(f"❌ 请求失败: HTTP {response.status_code}") |
||||||
|
print(response.text) |
||||||
|
|
||||||
|
except requests.exceptions.ConnectionError: |
||||||
|
print("❌ 连接失败,请确保双路径边界分类器服务正在运行") |
||||||
|
print(" 启动命令: python simplified_dual_path_boundary_classifier_api.py") |
||||||
|
print(" 服务地址: http://localhost:8888") |
||||||
|
except Exception as e: |
||||||
|
print(f"❌ 调用失败: {e}") |
||||||
@ -0,0 +1,247 @@ |
|||||||
|
#!/usr/bin/env python3 |
||||||
|
# -*- coding: utf-8 -*- |
||||||
|
""" |
||||||
|
根据source_id和label标签合并段落并输出txt文件 |
||||||
|
将label=0的连续句子合并,label=1作为分界点分段 |
||||||
|
""" |
||||||
|
|
||||||
|
import json |
||||||
|
import os |
||||||
|
from collections import defaultdict |
||||||
|
from typing import List, Dict, Any |
||||||
|
|
||||||
|
|
||||||
|
def load_test_data(file_path: str) -> List[Dict[str, Any]]: |
||||||
|
"""加载测试数据""" |
||||||
|
try: |
||||||
|
with open(file_path, 'r', encoding='utf-8') as f: |
||||||
|
data = json.load(f) |
||||||
|
print(f"成功加载 {len(data)} 条数据") |
||||||
|
return data |
||||||
|
except FileNotFoundError: |
||||||
|
print(f"错误:找不到文件 {file_path}") |
||||||
|
return [] |
||||||
|
except json.JSONDecodeError as e: |
||||||
|
print(f"错误:JSON格式错误 - {e}") |
||||||
|
return [] |
||||||
|
except Exception as e: |
||||||
|
print(f"错误:加载文件时出现问题 - {e}") |
||||||
|
return [] |
||||||
|
|
||||||
|
|
||||||
|
def group_by_source_id(data: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: |
||||||
|
"""按source_id分组数据""" |
||||||
|
grouped_data = defaultdict(list) |
||||||
|
|
||||||
|
for item in data: |
||||||
|
source_id = str(item.get('source_id', 'unknown')) |
||||||
|
grouped_data[source_id].append(item) |
||||||
|
|
||||||
|
# 对每个组内的数据按出现顺序排序(保持原有顺序) |
||||||
|
for source_id in grouped_data: |
||||||
|
# 如果数据中有索引信息,可以按索引排序 |
||||||
|
# 这里假设数据已经按正确顺序排列 |
||||||
|
pass |
||||||
|
|
||||||
|
print(f"数据按source_id分组完成,共 {len(grouped_data)} 个组") |
||||||
|
for source_id, items in grouped_data.items(): |
||||||
|
print(f" Source ID {source_id}: {len(items)} 条数据") |
||||||
|
|
||||||
|
return dict(grouped_data) |
||||||
|
|
||||||
|
|
||||||
|
def merge_paragraphs_by_labels(sentence_pairs: List[Dict[str, Any]]) -> List[str]: |
||||||
|
""" |
||||||
|
根据label合并段落 |
||||||
|
label=0: 同一段落,需要合并 |
||||||
|
label=1: 不同段落,作为分界点 |
||||||
|
""" |
||||||
|
if not sentence_pairs: |
||||||
|
return [] |
||||||
|
|
||||||
|
paragraphs = [] |
||||||
|
current_paragraph = [] |
||||||
|
|
||||||
|
# 处理第一个句子 |
||||||
|
if sentence_pairs: |
||||||
|
current_paragraph.append(sentence_pairs[0]['sentence1']) |
||||||
|
|
||||||
|
# 遍历所有句子对 |
||||||
|
for i, pair in enumerate(sentence_pairs): |
||||||
|
sentence2 = pair['sentence2'] |
||||||
|
label = pair['label'] |
||||||
|
|
||||||
|
if label == 0: |
||||||
|
# 同一段落,继续添加到当前段落 |
||||||
|
# 只添加sentence2,因为sentence1已经在上一轮添加过了 |
||||||
|
if sentence2 not in current_paragraph: # 避免重复 |
||||||
|
current_paragraph.append(sentence2) |
||||||
|
|
||||||
|
elif label == 1: |
||||||
|
# 不同段落,结束当前段落,开始新段落 |
||||||
|
if current_paragraph: |
||||||
|
paragraph_text = ''.join(current_paragraph) |
||||||
|
if paragraph_text.strip(): # 确保段落不为空 |
||||||
|
paragraphs.append(paragraph_text.strip()) |
||||||
|
|
||||||
|
# 开始新段落 |
||||||
|
current_paragraph = [sentence2] |
||||||
|
|
||||||
|
# 处理最后一个段落 |
||||||
|
if current_paragraph: |
||||||
|
paragraph_text = ''.join(current_paragraph) |
||||||
|
if paragraph_text.strip(): |
||||||
|
paragraphs.append(paragraph_text.strip()) |
||||||
|
|
||||||
|
return paragraphs |
||||||
|
|
||||||
|
|
||||||
|
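# Illustrative sketch (hypothetical sentences, not called anywhere) of how the
# labels drive the merging above:
def _demo_merge_paragraphs():
    pairs = [
        {"sentence1": "甲。", "sentence2": "乙。", "label": 0},  # same paragraph -> merge
        {"sentence1": "乙。", "sentence2": "丙。", "label": 1},  # boundary -> start a new paragraph
        {"sentence1": "丙。", "sentence2": "丁。", "label": 0},
    ]
    return merge_paragraphs_by_labels(pairs)  # expected: ['甲。乙。', '丙。丁。']
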
def process_single_source(source_id: str, sentence_pairs: List[Dict[str, Any]]) -> Dict[str, Any]: |
||||||
|
"""处理单个source_id的数据""" |
||||||
|
print(f"\n处理Source ID: {source_id}") |
||||||
|
print(f"句子对数量: {len(sentence_pairs)}") |
||||||
|
|
||||||
|
# 统计标签分布 |
||||||
|
label_counts = defaultdict(int) |
||||||
|
for pair in sentence_pairs: |
||||||
|
label_counts[pair['label']] += 1 |
||||||
|
|
||||||
|
print(f"标签分布: Label 0 (同段): {label_counts[0]}, Label 1 (分段): {label_counts[1]}") |
||||||
|
|
||||||
|
# 合并段落 |
||||||
|
paragraphs = merge_paragraphs_by_labels(sentence_pairs) |
||||||
|
|
||||||
|
print(f"合并后段落数: {len(paragraphs)}") |
||||||
|
|
||||||
|
# 统计信息 |
||||||
|
total_chars = sum(len(p) for p in paragraphs) |
||||||
|
avg_paragraph_length = total_chars / len(paragraphs) if paragraphs else 0 |
||||||
|
|
||||||
|
return { |
||||||
|
'source_id': source_id, |
||||||
|
'original_pairs_count': len(sentence_pairs), |
||||||
|
'merged_paragraphs_count': len(paragraphs), |
||||||
|
'label_distribution': dict(label_counts), |
||||||
|
'total_characters': total_chars, |
||||||
|
'avg_paragraph_length': avg_paragraph_length, |
||||||
|
'paragraphs': paragraphs |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
def save_to_txt(results: Dict[str, Dict[str, Any]], output_file: str): |
||||||
|
"""保存结果到txt文件""" |
||||||
|
with open(output_file, 'w', encoding='utf-8') as f: |
||||||
|
f.write("=" * 80 + "\n") |
||||||
|
f.write("段落合并结果\n") |
||||||
|
f.write("根据source_id和label标签合并的段落文本\n") |
||||||
|
f.write("=" * 80 + "\n\n") |
||||||
|
|
||||||
|
for source_id, result in results.items(): |
||||||
|
f.write(f"【Source ID: {source_id}】\n") |
||||||
|
f.write(f"原始句子对数量: {result['original_pairs_count']}\n") |
||||||
|
f.write(f"合并后段落数量: {result['merged_paragraphs_count']}\n") |
||||||
|
f.write(f"标签分布: {result['label_distribution']}\n") |
||||||
|
f.write(f"总字符数: {result['total_characters']}\n") |
||||||
|
f.write(f"平均段落长度: {result['avg_paragraph_length']:.1f} 字符\n") |
||||||
|
f.write("-" * 60 + "\n") |
||||||
|
|
||||||
|
for i, paragraph in enumerate(result['paragraphs'], 1): |
||||||
|
f.write(f"段落 {i}:\n{paragraph}\n\n") |
||||||
|
|
||||||
|
f.write("=" * 80 + "\n\n") |
||||||
|
|
||||||
|
|
||||||
|
def save_summary_json(results: Dict[str, Dict[str, Any]], output_file: str): |
||||||
|
"""保存统计摘要到JSON文件""" |
||||||
|
summary = { |
||||||
|
'total_source_ids': len(results), |
||||||
|
'total_original_pairs': sum(r['original_pairs_count'] for r in results.values()), |
||||||
|
'total_merged_paragraphs': sum(r['merged_paragraphs_count'] for r in results.values()), |
||||||
|
'total_characters': sum(r['total_characters'] for r in results.values()), |
||||||
|
'source_details': {} |
||||||
|
} |
||||||
|
|
||||||
|
for source_id, result in results.items(): |
||||||
|
summary['source_details'][source_id] = { |
||||||
|
'original_pairs_count': result['original_pairs_count'], |
||||||
|
'merged_paragraphs_count': result['merged_paragraphs_count'], |
||||||
|
'label_distribution': result['label_distribution'], |
||||||
|
'total_characters': result['total_characters'], |
||||||
|
'avg_paragraph_length': result['avg_paragraph_length'] |
||||||
|
} |
||||||
|
|
||||||
|
with open(output_file, 'w', encoding='utf-8') as f: |
||||||
|
json.dump(summary, f, ensure_ascii=False, indent=2) |
||||||
|
|
||||||
|
|
||||||
|
def main(): |
||||||
|
"""主函数""" |
||||||
|
# 配置文件路径 |
||||||
|
input_file = 'segmentation_results_from_new50.json' # 输入文件路径 |
||||||
|
output_txt = 'merged_paragraphs.txt' # 输出txt文件 |
||||||
|
output_summary = 'merge_summary.json' # 输出统计摘要 |
||||||
|
|
||||||
|
print("=" * 80) |
||||||
|
print("段落合并处理程序") |
||||||
|
print("根据source_id和label标签合并段落") |
||||||
|
print("=" * 80) |
||||||
|
|
||||||
|
# 检查输入文件 |
||||||
|
if not os.path.exists(input_file): |
||||||
|
print(f"错误:输入文件 {input_file} 不存在!") |
||||||
|
print("请确保test.json文件在当前目录下") |
||||||
|
return |
||||||
|
|
||||||
|
try: |
||||||
|
# 1. 加载数据 |
||||||
|
data = load_test_data(input_file) |
||||||
|
if not data: |
||||||
|
print("没有有效数据可处理") |
||||||
|
return |
||||||
|
|
||||||
|
# 2. 按source_id分组 |
||||||
|
grouped_data = group_by_source_id(data) |
||||||
|
|
||||||
|
# 3. 处理每个source_id的数据 |
||||||
|
results = {} |
||||||
|
total_paragraphs = 0 |
||||||
|
|
||||||
|
for source_id, sentence_pairs in grouped_data.items(): |
||||||
|
result = process_single_source(source_id, sentence_pairs) |
||||||
|
results[source_id] = result |
||||||
|
total_paragraphs += result['merged_paragraphs_count'] |
||||||
|
|
||||||
|
# 4. 保存结果 |
||||||
|
print(f"\n保存结果...") |
||||||
|
save_to_txt(results, output_txt) |
||||||
|
save_summary_json(results, output_summary) |
||||||
|
|
||||||
|
# 5. 输出总结 |
||||||
|
print("=" * 80) |
||||||
|
print("处理完成!") |
||||||
|
print("=" * 80) |
||||||
|
print(f"📊 处理统计:") |
||||||
|
print(f" 🔹 处理的Source ID数量: {len(results)}") |
||||||
|
print(f" 🔹 原始句子对总数: {sum(r['original_pairs_count'] for r in results.values())}") |
||||||
|
print(f" 🔹 合并后段落总数: {total_paragraphs}") |
||||||
|
print(f" 🔹 总字符数: {sum(r['total_characters'] for r in results.values())}") |
||||||
|
|
||||||
|
print(f"\n📁 输出文件:") |
||||||
|
print(f" 📄 {output_txt} - 合并后的段落文本") |
||||||
|
print(f" 📄 {output_summary} - 处理统计摘要") |
||||||
|
|
||||||
|
print(f"\n📋 各Source ID详情:") |
||||||
|
for source_id, result in results.items(): |
||||||
|
print( |
||||||
|
f" Source {source_id}: {result['original_pairs_count']} 对 → {result['merged_paragraphs_count']} 段") |
||||||
|
|
||||||
|
print(f"\n✅ 段落合并完成!请查看 {output_txt} 文件") |
||||||
|
|
||||||
|
except Exception as e: |
||||||
|
print(f"❌ 处理过程中出现错误: {str(e)}") |
||||||
|
import traceback |
||||||
|
traceback.print_exc() |
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": |
||||||
|
main() |
||||||
File diff suppressed because one or more lines are too long
[Image: 271 KiB]
[Image: 208 KiB]
File diff suppressed because it is too large
@ -0,0 +1,70 @@ |
|||||||
|
{ |
||||||
|
"overall_accuracy": 0.9827222924568058, |
||||||
|
"class_metrics": { |
||||||
|
"class_0_same_paragraph": { |
||||||
|
"precision": 0.995013061030634, |
||||||
|
"recall": 0.9856504351917196, |
||||||
|
"f1_score": 0.9903096194753014, |
||||||
|
"support": 4251 |
||||||
|
}, |
||||||
|
"class_1_different_paragraph": { |
||||||
|
"precision": 0.8859813084112149, |
||||||
|
"recall": 0.9575757575757575, |
||||||
|
"f1_score": 0.920388349514563, |
||||||
|
"support": 495 |
||||||
|
} |
||||||
|
}, |
||||||
|
"confusion_matrix": [ |
||||||
|
[ |
||||||
|
4190, |
||||||
|
61 |
||||||
|
], |
||||||
|
[ |
||||||
|
21, |
||||||
|
474 |
||||||
|
] |
||||||
|
], |
||||||
|
"classification_report": { |
||||||
|
"Same Paragraph (0)": { |
||||||
|
"precision": 0.995013061030634, |
||||||
|
"recall": 0.9856504351917196, |
||||||
|
"f1-score": 0.9903096194753014, |
||||||
|
"support": 4251.0 |
||||||
|
}, |
||||||
|
"Different Paragraph (1)": { |
||||||
|
"precision": 0.8859813084112149, |
||||||
|
"recall": 0.9575757575757575, |
||||||
|
"f1-score": 0.920388349514563, |
||||||
|
"support": 495.0 |
||||||
|
}, |
||||||
|
"accuracy": 0.9827222924568058, |
||||||
|
"macro avg": { |
||||||
|
"precision": 0.9404971847209245, |
||||||
|
"recall": 0.9716130963837386, |
||||||
|
"f1-score": 0.9553489844949322, |
||||||
|
"support": 4746.0 |
||||||
|
}, |
||||||
|
"weighted avg": { |
||||||
|
"precision": 0.9836412284249424, |
||||||
|
"recall": 0.9827222924568058, |
||||||
|
"f1-score": 0.9830169459332522, |
||||||
|
"support": 4746.0 |
||||||
|
} |
||||||
|
}, |
||||||
|
"test_samples_count": 4746, |
||||||
|
"boundary_score_stats": { |
||||||
|
"mean": 0.14779670536518097, |
||||||
|
"std": 0.29127171635627747, |
||||||
|
"min": 0.042458675801754, |
||||||
|
"max": 0.9999679327011108 |
||||||
|
}, |
||||||
|
"evaluation_info": { |
||||||
|
"evaluation_time": "0:01:19.859698", |
||||||
|
"device_used": "cuda", |
||||||
|
"model_type": "DualPathBoundaryClassifier", |
||||||
|
"max_length": 384, |
||||||
|
"batch_size": 32, |
||||||
|
"test_file": "D:\\workstation\\AI标注\\数据清洗+json\\test_dataset.json", |
||||||
|
"trained_model_path": "D:\\workstation\\chinese-roberta-wwm-ext\\model-train-eval-NN\\model_train" |
||||||
|
} |
||||||
|
} |
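As a sanity check, the per-class precision/recall/F1 and the overall accuracy above follow directly from the confusion matrix, assuming the usual convention of rows = true label and columns = predicted label; a minimal sketch:

import numpy as np

cm = np.array([[4190, 61],    # true label 0: predicted 0, predicted 1
               [21, 474]])    # true label 1: predicted 0, predicted 1
for c in (0, 1):
    tp = cm[c, c]
    precision = tp / cm[:, c].sum()
    recall = tp / cm[c, :].sum()
    f1 = 2 * precision * recall / (precision + recall)
    print(f"class {c}: precision={precision:.4f} recall={recall:.4f} f1={f1:.4f}")
print(f"accuracy={cm.trace() / cm.sum():.4f}")  # ≈ 0.9827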
||||||
@ -0,0 +1,41 @@ |
|||||||
|
{ |
||||||
|
"model_info": { |
||||||
|
"model_type": "Dual Path Boundary Classifier", |
||||||
|
"base_model": "Chinese-RoBERTa-WWM-Ext", |
||||||
|
"max_length": 384, |
||||||
|
"trained_model_path": "D:\\workstation\\chinese-roberta-wwm-ext\\model-train-eval-NN\\model_train" |
||||||
|
}, |
||||||
|
"test_results": { |
||||||
|
"overall_accuracy": 0.9827222924568058, |
||||||
|
"total_samples": 4746, |
||||||
|
"correct_predictions": 4664, |
||||||
|
"incorrect_predictions": 82 |
||||||
|
}, |
||||||
|
"class_performance": { |
||||||
|
"class_0_same_paragraph": { |
||||||
|
"precision": 0.995013061030634, |
||||||
|
"recall": 0.9856504351917196, |
||||||
|
"f1_score": 0.9903096194753014, |
||||||
|
"support": 4251 |
||||||
|
}, |
||||||
|
"class_1_different_paragraph": { |
||||||
|
"precision": 0.8859813084112149, |
||||||
|
"recall": 0.9575757575757575, |
||||||
|
"f1_score": 0.920388349514563, |
||||||
|
"support": 495 |
||||||
|
} |
||||||
|
}, |
||||||
|
"boundary_detection": { |
||||||
|
"mean": 0.14779670536518097, |
||||||
|
"std": 0.29127171635627747, |
||||||
|
"min": 0.042458675801754, |
||||||
|
"max": 0.9999679327011108 |
||||||
|
}, |
||||||
|
"files_generated": [ |
||||||
|
"test_results_detailed.json", |
||||||
|
"detailed_predictions.json", |
||||||
|
"confusion_matrix_test_results.png", |
||||||
|
"class_performance_analysis.png", |
||||||
|
"test_summary.json" |
||||||
|
] |
||||||
|
} |
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large