import os
import sys

import jieba

# 1. Define the file paths (kept together so they are easy to change)
input_file_path = '../data/文本.txt'
output_file_path = '../data/切词.txt'
user_dict_path = 'userdict.txt'
stop_words_path = 'stopwords.txt'
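# The user dictionary follows jieba's documented format: one entry per line,
# "word [frequency] [POS tag]", with frequency and POS tag optional, e.g.:
#   云计算 5 n
#   自然语言处理 n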

# 2. Make sure the output directory exists
# (the empty-string check guards against paths with no directory component,
# where os.makedirs('') would raise)
output_dir = os.path.dirname(output_file_path)
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

# 3. Read the data
try:
    # Note: pass encoding='utf-8' explicitly when reading Chinese text
    with open(input_file_path, 'r', encoding='utf-8') as file:
        # strip() trims surrounding whitespace; blank lines are skipped
        docs = [line.strip() for line in file if line.strip()]
        print(f'Read {len(docs)} lines. Sample: {docs[0][:20]}...')
except FileNotFoundError:
    print(f"Error: file {input_file_path} not found")
    sys.exit(1)
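# The input is assumed to be one document per line; each line is segmented
# independently below.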

# 4. Load the jieba configuration
if os.path.exists(user_dict_path):
    jieba.load_userdict(user_dict_path)
else:
    print(f"Warning: custom dictionary {user_dict_path} not found, skipping it.")

stopwords = set()
if os.path.exists(stop_words_path):
    # A set gives O(1) membership tests in the filtering step below
    with open(stop_words_path, encoding='utf-8') as f:
        stopwords = {line.strip() for line in f if line.strip()}
    print(f'Loaded {len(stopwords)} stopwords.')
else:
    print(f"Warning: stopwords file {stop_words_path} not found; stopword filtering will be skipped.")

# 5. Word segmentation
segmented_lines = []
for i, line_text in enumerate(docs):
    # ⚠️ Truncation: keep only the first 512 characters of each line
    if len(line_text) > 512:
        line_text = line_text[:512]

    seg_list = jieba.cut(line_text)
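    # jieba.cut() defaults to accurate mode and returns a generator; for
    # reference, jieba also offers:
    #   jieba.cut(line_text, cut_all=True)   # full mode
    #   jieba.cut_for_search(line_text)      # search-engine mode
    #   jieba.lcut(line_text)                # same as cut(), but returns a list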
    
    # Filter: drop stopwords, keep only tokens that are at least
    # two characters long and are not pure digits
    filtered_words = [
        word for word in seg_list
        if word not in stopwords and len(word) >= 2 and not word.isdigit()
    ]

    # Print only the first 5 lines to avoid flooding the console
    if i < 5:
        print(f"Line {i} result: {filtered_words}")

    segmented_lines.append(' '.join(filtered_words))

print(f'Segmentation finished: {len(segmented_lines)} lines processed.')

# 6. Save the results, one segmented line per row
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(segmented_lines))
print(f"Results saved to: {output_file_path}")