# NOTE(review): removed web-UI residue accidentally captured with the source
# (repository topic rules and the "155 lines / 5.2 KiB" file stats from the
# code-hosting page) — it was not Python code and made the file unparseable.

import re
import json
import pandas as pd
import os
def split_sentences(text):
    """Split *text* into sentences on Chinese end punctuation (。!?).

    Each delimiter is kept attached to the end of the sentence it
    terminates; segments that are empty after stripping whitespace are
    dropped.

    Args:
        text: Raw text to split.

    Returns:
        List of sentence strings, each ending with its punctuation mark
        (the final segment may lack one if the text did not end with a
        delimiter).
    """
    # The capture group makes re.split keep the delimiters: the result
    # alternates [segment, delimiter, segment, delimiter, ..., tail].
    parts = re.split(r'([。!?])', text)
    sentences = []
    # Step over (segment, delimiter) pairs; the final tail segment has no
    # delimiter, hence the bounds check on i + 1.
    for i in range(0, len(parts), 2):
        segment = parts[i].strip()
        if segment:
            punctuation = parts[i + 1] if i + 1 < len(parts) else ''
            sentences.append(segment + punctuation)
    return sentences
def create_sentence_pairs(sentences):
    """Pair each sentence with its immediate successor.

    Args:
        sentences: Ordered list of sentence strings.

    Returns:
        List of dicts with keys ``sentence1``, ``sentence2`` and
        ``label``; label is -1, meaning "not yet annotated".
    """
    # zip against the shifted list yields all consecutive pairs; an input
    # of fewer than two sentences naturally produces an empty list.
    return [
        {"sentence1": first, "sentence2": second, "label": -1}
        for first, second in zip(sentences, sentences[1:])
    ]
# Load the input CSV, probing a list of common encodings until one works.
try:
    candidate_encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']
    df = None
    for encoding in candidate_encodings:
        try:
            print(f"尝试使用 {encoding} 编码读取文件...")
            df = pd.read_csv('batch_deduplication_results_619-1103_01.csv', encoding=encoding)
            print(f"成功使用 {encoding} 编码读取文件")
        except UnicodeDecodeError:
            # Wrong encoding — try the next candidate.
            continue
        else:
            break
    if df is None:
        print("错误:尝试了所有常见编码都无法读取文件")
        exit()
except FileNotFoundError:
    print("错误:找不到文件 'batch_deduplication_results_619-1103_01.csv'")
    exit()
except Exception as e:
    # Any other read failure (malformed CSV, permissions, ...) is fatal.
    print(f"读取CSV文件时发生错误:{e}")
    exit()
# Create the per-ID output directory. exist_ok=True avoids the
# check-then-create race of os.path.exists + os.makedirs.
output_dir = 'sentence_pairs_output_all'
os.makedirs(output_dir, exist_ok=True)

# Run-wide accumulators: every sentence pair across all rows, plus a
# per-row processing record used for the summary CSV at the end.
all_sentence_pairs = []
summary_info = []

print(f"CSV文件共有 {len(df)} 行数据")
print("开始遍历所有ID...")
# Iterate every CSV row: split its text into sentences, build
# consecutive-sentence pairs, and write one JSON file per ID.
for index, row in df.iterrows():
    # Defined before the try so the except handler can always report it;
    # previously it was undefined (NameError) or stale if row['id'] raised.
    current_id = None
    try:
        current_id = row['id']
        raw_text = row['final_processed_text']

        # Skip rows whose text cell is missing (NaN) or blank.
        if pd.isna(raw_text) or str(raw_text).strip() == '':
            print(f"ID {current_id}: 文本内容为空,跳过")
            summary_info.append({
                'id': current_id,
                'status': '文本为空',
                'sentences_count': 0,
                'pairs_count': 0
            })
            continue

        # Split into sentences and pair each with its successor.
        sentences = split_sentences(str(raw_text))
        sentence_pairs = create_sentence_pairs(sentences)

        # Tag each pair with the row it came from.
        for pair in sentence_pairs:
            pair['source_id'] = current_id

        # Add to the run-wide aggregate.
        all_sentence_pairs.extend(sentence_pairs)

        # Write a per-ID JSON file, but only when the row produced pairs.
        if sentence_pairs:
            filename = f'sentence_pairs_id_{current_id}.json'
            filepath = os.path.join(output_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(sentence_pairs, f, ensure_ascii=False, indent=2)

        # Record the outcome for the summary CSV.
        summary_info.append({
            'id': current_id,
            'status': '成功处理',
            'sentences_count': len(sentences),
            'pairs_count': len(sentence_pairs),
            'text_length': len(str(raw_text))
        })
        print(f"ID {current_id}: 分割出 {len(sentences)} 个句子,生成 {len(sentence_pairs)} 个句子对")
    except Exception as e:
        # Best-effort batch processing: log the failure and keep going.
        # current_id may still be None if reading row['id'] itself failed.
        print(f"处理ID {current_id} 时发生错误:{e}")
        summary_info.append({
            'id': current_id,
            'status': f'错误: {str(e)}',
            'sentences_count': 0,
            'pairs_count': 0
        })
# Persist the aggregated pairs for annotation.
print("\n保存汇总数据...")
with open('all_sentence_pairs_for_annotation.json', 'w', encoding='utf-8') as f:
    json.dump(all_sentence_pairs, f, ensure_ascii=False, indent=2)

# Persist the per-row processing summary (utf-8-sig adds a BOM so Excel
# detects the encoding correctly).
summary_df = pd.DataFrame(summary_info)
summary_df.to_csv('processing_summary.csv', index=False, encoding='utf-8-sig')

# Aggregate run statistics from the per-row records.
total_sentences = sum(info['sentences_count'] for info in summary_info)
total_pairs = sum(info['pairs_count'] for info in summary_info)
successful_ids = sum(1 for info in summary_info if info['status'] == '成功处理')

print(f"\n=== 处理完成 ===")
print(f"总计处理了 {len(df)} 个ID")
print(f"成功处理 {successful_ids} 个ID")
print(f"总计分割出 {total_sentences} 个句子")
print(f"总计生成 {total_pairs} 个句子对")
print(f"汇总数据保存到: all_sentence_pairs_for_annotation.json")
print(f"单独文件保存在: {output_dir}/ 目录")
print(f"处理摘要保存到: processing_summary.csv")

# Preview up to the first three generated pairs.
if all_sentence_pairs:
    print("\n前3个句子对示例:")
    for ordinal, pair in enumerate(all_sentence_pairs[:3], start=1):
        print(f"\n{ordinal}对 (来源ID: {pair['source_id']}):")
        print(f"句子1: {pair['sentence1']}")
        print(f"句子2: {pair['sentence2']}")
        print(f"标签: {pair['label']}")