1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
| import json from datasets import load_dataset import openai from tqdm import tqdm import os import concurrent.futures
OUTPUT_FILE = "data/planning_coding.jsonl" OPENAI_MODEL = "deepseek-v3" MAX_WORKERS = 5 PLANNING_PROMPT = """你是一个高级算法工程师。真正优秀的算法工程师总会在写代码之前做好任务的规划,因此,请你在解决这些问题之前**显式地做出行之有效的任务规划**,并说“真正的大师总是会在规划之后再动手”。
你需要解决的问题是: {task_description}
你可以看到标准答案,但是**请你根据你的规划逐步生成代码**,你必须输出你的一步步完成规划的轨迹!最关键的不是答案,而是你的规划能力!
{GT}
请严格按照以下格式回答:
真正的大师总是需要规划之后再动手!我先做一个规划: ### Planning 1. ... 2. ... 3. ...
### Code
#### Solving Sub-Tasks1 ```the programming language ... /`/`/`
#### Solving Sub-Tasks2 ```the programming language ... /```
#### Solving Sub-Tasks3 ```the programming language ... /```
#### Solving Sub-Tasks4 ```the programming language ... /```
...
#### Final Code ```the programming language ... /```
"""
def get_dataset(): """加载数据集并返回编码任务列表。""" ds = load_dataset("HuggingFaceH4/CodeAlpaca_20K", split="train") coding_tasks = [x for x in ds] print(f"The length of the coding tasks: {len(coding_tasks)}") return coding_tasks
def generate_planning_code(task_description: str, GT: str) -> str: """调用OpenAI API生成带规划的代码,这是I/O密集型操作。""" prompt = PLANNING_PROMPT.format(task_description=task_description, GT=GT) client = openai.OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:18889/v1") try: response = client.chat.completions.create( model=OPENAI_MODEL, messages=[{"role": "user", "content": prompt}], temperature=0.7, max_tokens=2048, ) return response.choices[0].message.content.strip() except Exception as e: print(f"[ERROR] API call failed: {e}") return f"[ERROR] {e}"
def process_task(task: dict) -> dict: """单个任务的处理器,用于在线程池中执行。""" full_response = generate_planning_code( task_description=task["prompt"], GT=task["completion"] ) return {"instruction": task, "input": "", "output": full_response}
def main(): """主函数,负责加载数据、设置并发执行和结果收集。""" print("Loading Datasets...") coding_tasks = get_dataset() os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) tasks_completed = 0
print(f"Starting concurrent data generation with {MAX_WORKERS} workers...") with open(OUTPUT_FILE, "w", encoding="utf-8") as f: with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: future_to_task = {executor.submit(process_task, task): task for task in coding_tasks} for future in tqdm( concurrent.futures.as_completed(future_to_task), total=len(coding_tasks), desc="Generating planning data (Writing on completion)" ): try: result_item = future.result() f.write(json.dumps(result_item, ensure_ascii=False) + "\n") f.flush() tasks_completed += 1 except Exception as exc: task_failed = future_to_task[future] print(f'\n[Task Error] Task for prompt "{task_failed.get("prompt", "Unknown")[:30]}..." generated an exception: {exc}') print(f"\n生成完成!成功写入 {tasks_completed} 个任务数据至:{OUTPUT_FILE}")
def test_calling(): client = openai.OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:18889/v1") response = client.chat.completions.create( model=OPENAI_MODEL, messages=[{"role": "user", "content": "Introduce yourself"}], temperature=0.7, max_tokens=1024, ) print(response)
def get_file_data(line: str): line_data = json.loads(line) return { "instruction": "Solving the problems below, remember to draft a planning in chinese, and then finish your sub-tasks step-by-step", "input": line_data["instruction"]["prompt"], "output": line_data["output"] }
def get_json_file_path(): file_path = "./data/planning_coding.jsonl" with open(file_path, "r", encoding="utf-8") as file: data = [get_file_data(line=line) for line in file] with open("./data/planning_coding.json", "w", encoding="utf-8") as file: json.dump(data, file, ensure_ascii=False, indent=2)
if __name__ == "__main__": get_json_file_path()
|