# preprocess.py
import jieba
import os
def load_data(file_path):
    """Read a text file and return its non-empty lines, stripped.

    Whitespace-only lines are dropped here so that every script using this
    loader sees the same document indices.

    Args:
        file_path: Path of the text file, one document per line. The file is
            decoded as UTF-8; a leading byte-order mark, if present, is
            discarded.

    Returns:
        A list of stripped, non-empty lines in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"未找到文件: {file_path}")
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM, which
    # str.strip() would otherwise leave glued to the first document.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        # Strip each line once and stream the file instead of readlines().
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]
def segment_text(docs, stopwords_path, userdict_path=None, max_len=512):
    """Segment each document with jieba and drop uninformative tokens.

    Args:
        docs: Iterable of document strings.
        stopwords_path: Path of a UTF-8 stop-word file, one word per line.
            If the path does not exist, no stop words are applied.
        userdict_path: Optional path of a jieba user dictionary; it is loaded
            only when given and present on disk.
        max_len: Maximum number of characters kept from each document before
            segmentation. Defaults to 512 (the original note cites the usual
            512 length limit of BERT-style models). Pass ``None`` to disable
            truncation.

    Returns:
        A list with one string per input document, made of the kept tokens
        joined by single spaces. A token is kept when it has at least two
        characters, is not all digits, and is not a stop word.
    """
    if userdict_path and os.path.exists(userdict_path):
        jieba.load_userdict(userdict_path)

    stopwords = set()
    if os.path.exists(stopwords_path):
        # Context manager closes the file; 'utf-8-sig' keeps a leading BOM
        # from corrupting the first stop word.
        with open(stopwords_path, 'r', encoding='utf-8-sig') as stop_file:
            stopwords = {line.strip() for line in stop_file}

    cutted_text = []
    for line_text in docs:
        if max_len is not None:
            # Slicing is a no-op for documents already within the limit.
            line_text = line_text[:max_len]
        # Drop single-character tokens, pure-digit tokens and stop words.
        filtered_words = [
            w for w in jieba.cut(line_text)
            if len(w) >= 2 and not w.isdigit() and w not in stopwords
        ]
        cutted_text.append(' '.join(filtered_words))
    return cutted_text
if __name__ == "__main__":
    # Script entry point: load the raw corpus, segment it, and write one
    # space-separated token line per document to the output file.
    raw_docs = load_data('./data/文本.txt')
    segmented = segment_text(
        raw_docs,
        './data/stopwords.txt',
        './data/userdict.txt',
    )
    output_text = '\n'.join(segmented)
    with open('./data/切词.txt', 'w', encoding='utf-8') as out_file:
        out_file.write(output_text)
    print("分词预处理完成。")