# 临时代码库 - preprocess.py

# preprocess.py
import jieba
import os

def load_data(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of stripped, non-empty lines in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"未找到文件: {file_path}")

    kept_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            # Blank lines are filtered here, uniformly, so that document
            # indices stay consistent across the scripts using this loader.
            if cleaned:
                kept_lines.append(cleaned)
    return kept_lines

def _load_stopwords(stopwords_path):
    """Return the set of stripped lines in *stopwords_path*, or an empty set if it is missing."""
    if not os.path.exists(stopwords_path):
        return set()
    # The context manager closes the handle deterministically; the previous
    # inline ``open(...).readlines()`` never closed the file.
    with open(stopwords_path, 'r', encoding='UTF-8') as stop_file:
        return {line.strip() for line in stop_file}


def segment_text(docs, stopwords_path, userdict_path=None, max_chars=512):
    """Segment each document with jieba and filter the resulting tokens.

    Every document is truncated to ``max_chars`` characters, cut with
    ``jieba.cut``, and the surviving tokens are joined with single spaces.
    A token is dropped when it is a stopword, is shorter than two
    characters, or consists only of digits.

    Args:
        docs: Iterable of document strings.
        stopwords_path: Path to a UTF-8 stopword file, one word per line.
            If the file does not exist, no stopwords are applied.
        userdict_path: Optional path passed to ``jieba.load_userdict``;
            ignored when falsy or when the file does not exist.
        max_chars: Maximum number of characters kept per document before
            segmentation. Defaults to 512 (the original comment cites a
            tutorial suggestion that BERT-style models usually limit input
            length to 512). Pass ``None`` to disable truncation.

    Returns:
        A list with one space-joined token string per input document, in
        input order. A document whose tokens are all filtered out yields an
        empty string, so the list length always matches ``docs``.
    """
    if userdict_path and os.path.exists(userdict_path):
        jieba.load_userdict(userdict_path)

    stopwords = _load_stopwords(stopwords_path)

    cutted_text = []
    for line_text in docs:
        if max_chars is not None:
            # Slicing is a no-op for texts already within the limit.
            line_text = line_text[:max_chars]
        # Drop single-character tokens, pure-digit tokens, and stopwords.
        filtered_words = [
            w for w in jieba.cut(line_text)
            if len(w) >= 2 and not w.isdigit() and w not in stopwords
        ]
        cutted_text.append(' '.join(filtered_words))
    return cutted_text

if __name__ == "__main__":
    # Script entry point: load the raw documents, segment them, and write
    # one space-joined token line per document to the output file.
    raw_docs = load_data('./data/文本.txt')
    segmented_docs = segment_text(
        raw_docs,
        './data/stopwords.txt',
        './data/userdict.txt',
    )
    with open('./data/切词.txt', 'w', encoding='utf-8') as out_file:
        joined_output = '\n'.join(segmented_docs)
        out_file.write(joined_output)
    print("分词预处理完成。")
# 直链已复制！
# 可使用 wget 直接下载