You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
155 lines
5.2 KiB
155 lines
5.2 KiB
import json
import os
import re
import sys

import pandas as pd
|
|
def split_sentences(text):
    """Split *text* into sentences on Chinese end punctuation (。!?).

    Each sentence keeps its trailing punctuation mark; a final fragment
    with no terminator is returned as-is. Whitespace-only fragments are
    dropped. Returns a list of strings (empty for empty input).
    """
    # Splitting with a capture group keeps the delimiters, so the result
    # alternates [text, punct, text, punct, ..., trailing-text].
    pieces = re.split(r'([。!?])', text)
    bodies = pieces[0::2]
    # Pad the punctuation stream so zip also covers a trailing fragment
    # that has no terminator.
    marks = pieces[1::2] + ['']
    return [body.strip() + mark
            for body, mark in zip(bodies, marks)
            if body.strip()]
|
def create_sentence_pairs(sentences):
    """Pair each sentence with its immediate successor.

    Returns a list of dicts with keys "sentence1", "sentence2" and
    "label", where label -1 means "not yet annotated". Fewer than two
    sentences yields an empty list.
    """
    return [
        {"sentence1": first, "sentence2": second, "label": -1}
        for first, second in zip(sentences, sentences[1:])
    ]
|
|
|
|
|
# Load the source CSV, probing a list of common encodings because the
# file's encoding is not known in advance.
try:
    encodings = ['utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin1']
    df = None

    for encoding in encodings:
        try:
            print(f"尝试使用 {encoding} 编码读取文件...")
            df = pd.read_csv('batch_deduplication_results_619-1103_01.csv', encoding=encoding)
            print(f"成功使用 {encoding} 编码读取文件")
            break
        except UnicodeDecodeError:
            # Wrong encoding guess -- try the next candidate.
            continue

    if df is None:
        print("错误:尝试了所有常见编码都无法读取文件")
        # sys.exit(1): the bare exit() used before is a site-module
        # convenience and would report success (status 0) on this
        # failure path.
        sys.exit(1)

except FileNotFoundError:
    print("错误:找不到文件 'batch_deduplication_results_619-1103_01.csv'")
    sys.exit(1)
except Exception as e:
    # Deliberate catch-all: report any other read failure to the
    # operator instead of dumping a traceback.
    print(f"读取CSV文件时发生错误:{e}")
    sys.exit(1)
|
|
# Directory receiving one JSON file per source ID.
output_dir = 'sentence_pairs_output_all'
# exist_ok=True is race-free; the previous exists()-then-makedirs pair
# could crash if the directory appeared between the two calls.
os.makedirs(output_dir, exist_ok=True)

# Accumulators: every sentence pair across all IDs, plus one summary
# record per processed row.
all_sentence_pairs = []
summary_info = []
|
print(f"CSV文件共有 {len(df)} 行数据")
print("开始遍历所有ID...")

# Walk every row: split its text into sentences, build consecutive
# sentence pairs, and write one JSON file per source ID.
for index, row in df.iterrows():
    # Bind the ID before the try so the error handler can always print
    # it. Previously it was assigned inside the try, so a failure on the
    # row['id'] lookup itself made the handler crash with NameError.
    current_id = row.get('id', index)
    try:
        raw_text = row['final_processed_text']

        # Skip rows whose text is missing or blank.
        if pd.isna(raw_text) or str(raw_text).strip() == '':
            print(f"ID {current_id}: 文本内容为空,跳过")
            summary_info.append({
                'id': current_id,
                'status': '文本为空',
                'sentences_count': 0,
                'pairs_count': 0
            })
            continue

        # Split into sentences and pair consecutive ones.
        sentences = split_sentences(str(raw_text))
        sentence_pairs = create_sentence_pairs(sentences)

        # Tag every pair with the row it came from.
        for pair in sentence_pairs:
            pair['source_id'] = current_id

        # Feed the global pool used for the combined output file.
        all_sentence_pairs.extend(sentence_pairs)

        # Write a per-ID file only when the row produced at least one pair.
        if sentence_pairs:
            filename = f'sentence_pairs_id_{current_id}.json'
            filepath = os.path.join(output_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(sentence_pairs, f, ensure_ascii=False, indent=2)

        # Record the processing outcome for the summary CSV.
        summary_info.append({
            'id': current_id,
            'status': '成功处理',
            'sentences_count': len(sentences),
            'pairs_count': len(sentence_pairs),
            'text_length': len(str(raw_text))
        })

        print(f"ID {current_id}: 分割出 {len(sentences)} 个句子,生成 {len(sentence_pairs)} 个句子对")

    except Exception as e:
        # Best-effort batch processing: log the failure, record it in
        # the summary, and keep going with the next row.
        print(f"处理ID {current_id} 时发生错误:{e}")
        summary_info.append({
            'id': current_id,
            'status': f'错误: {str(e)}',
            'sentences_count': 0,
            'pairs_count': 0
        })
|
# Write the combined pool of sentence pairs for annotation.
print("\n保存汇总数据...")
with open('all_sentence_pairs_for_annotation.json', 'w', encoding='utf-8') as f:
    json.dump(all_sentence_pairs, f, ensure_ascii=False, indent=2)

# Persist the per-row processing summary (utf-8-sig so Excel opens it
# with the correct encoding).
summary_df = pd.DataFrame(summary_info)
summary_df.to_csv('processing_summary.csv', index=False, encoding='utf-8-sig')

# Aggregate statistics over the summary records.
total_sentences = sum(info['sentences_count'] for info in summary_info)
total_pairs = sum(info['pairs_count'] for info in summary_info)
successful_ids = sum(1 for info in summary_info if info['status'] == '成功处理')

print(f"\n=== 处理完成 ===")
print(f"总计处理了 {len(df)} 个ID")
print(f"成功处理 {successful_ids} 个ID")
print(f"总计分割出 {total_sentences} 个句子")
print(f"总计生成 {total_pairs} 个句子对")
print(f"汇总数据保存到: all_sentence_pairs_for_annotation.json")
print(f"单独文件保存在: {output_dir}/ 目录")
print(f"处理摘要保存到: processing_summary.csv")

# Preview the first few pairs so the operator can sanity-check the run.
if all_sentence_pairs:
    print("\n前3个句子对示例:")
    for ordinal, sample in enumerate(all_sentence_pairs[:3], start=1):
        print(f"\n第{ordinal}对 (来源ID: {sample['source_id']}):")
        print(f"句子1: {sample['sentence1']}")
        print(f"句子2: {sample['sentence2']}")
        print(f"标签: {sample['label']}")