"""Extract character-dialogue training data from a novel via an LLM.

Reads ``./data/test.txt`` in ~500-token chunks, sends each chunk to the
model together with an extraction system prompt, strips any markdown code
fence from the JSON response, validates each item, and appends the valid
items to ``<file_name>.jsonl`` (one JSON object per line).
"""

import json
import platform

from extract import system_prompt
from schema import novel_schema
from LLM import DeepseekChat
from utils import ReadFiles
from tqdm import tqdm

# Windows multiprocessing compatibility fix.
if platform.system() == "Windows":
    import multiprocessing
    multiprocessing.freeze_support()


def _strip_code_fence(text: str) -> str:
    """Return *text* with a surrounding markdown code fence removed.

    Handles responses wrapped as ```json ... ``` or ``` ... ```; plain
    responses are only whitespace-stripped.
    """
    text = text.strip()
    if text.startswith('```json'):
        text = text[7:]   # drop leading ```json
    if text.startswith('```'):
        text = text[3:]   # drop leading ```
    if text.endswith('```'):
        text = text[:-3]  # drop trailing ```
    return text.strip()


def _is_valid_item(item: dict) -> bool:
    """True when 'instruction', 'output' and 'character' are all non-empty."""
    return bool(
        item.get('instruction', '').strip()
        and item.get('output', '').strip()
        and item.get('character', '').strip()
    )


def main():
    """Run the end-to-end extraction pipeline and write the JSONL output."""
    file_path = './data/test.txt'
    model_path = '/mnt/g/Project02/AITrain/Qwen/Qwen3-8B-AWQ'

    # Chunk the novel: max 500 tokens per chunk, no overlap between chunks.
    docs = ReadFiles(file_path).get_content(max_token_len=500, cover_content=0)
    sys_prompt = system_prompt(novel_schema)

    # model = DeepseekChat()  # API mode: requires configuring url and api key.
    # Local model: adjust `model_path` to the model location on your machine.
    model = DeepseekChat(path=model_path, use_api=False)

    # 'data/test.txt' -> 'test'
    file_name = file_path.split('/')[-1].split('.')[0]
    try:
        # Open the output file once for the whole run (append mode) instead
        # of re-opening it for every extracted item.
        with open(f'{file_name}.jsonl', 'a', encoding='utf-8') as out:
            for doc in tqdm(docs):
                # Clean the response: strip any markdown code fence.
                response = _strip_code_fence(model.chat(sys_prompt, doc))
                try:
                    items = json.loads(response)
                    for item in items:
                        # Data-quality check: skip items with empty fields.
                        if _is_valid_item(item):
                            json.dump(item, out, ensure_ascii=False)
                            out.write('\n')
                        else:
                            print(f"跳过空字段数据: instruction='{item.get('instruction', '')}', output='{item.get('output', '')}', character='{item.get('character', '')}'")
                except Exception as e:
                    # Broad catch is deliberate: one bad chunk must not abort
                    # the whole run. Log the error and a response excerpt.
                    print(f"解析错误: {e}")
                    print(f"原始响应: {repr(response[:200])}")  # first 200 chars for debugging
    finally:
        # Always release the LLM instance, even on error/interrupt.
        print("Cleaning up model resources...")
        model.cleanup()


if __name__ == '__main__':
    main()