Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,382 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6fbf66d6",
"metadata": {},
"source": [
"## Synthetic Data Generator\n",
"This Data generator uses a large language model and a Gradio interface to allows users to dynamically generate realistic tabular datasets in CSV format based on a specified topic, column structure, and number of rows. It's designed to run on Colab only.\n",
"\n",
"The application constructs a structured prompt that instructs the model to output strictly formatted CSV data, which is then parsed into a pandas DataFrame for preview and download. The generated dataset can be exported as a CSV file for further analysis or experimentation.\n",
"\n",
"Key Features\n",
"\n",
"- Generate synthetic datasets from natural language input\n",
"\n",
"- Customizable topic, columns, and dataset size\n",
"\n",
"- Automatic CSV validation and parsing\n",
"\n",
"- Data preview within a Gradio UI\n",
"\n",
"- One-click CSV download\n",
"\n",
"- Designed for rapid prototyping and testing ML/data workflows\n",
"\n",
"Use Cases\n",
"\n",
"- Testing data pipelines\n",
"\n",
"- Creating mock datasets\n",
"\n",
"- Prototyping machine learning experiments\n",
"\n",
"- Educational and demonstration purposes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "225cf141",
"metadata": {},
"outputs": [],
"source": [
"!pip install -q transformers accelerate bitsandbytes gradio pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "878de128",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import pandas as pd\n",
"import gradio as gr\n",
"import gc\n",
"import os\n",
"from datetime import datetime\n",
"\n",
"from google.colab import userdata \n",
"from huggingface_hub import login\n",
"from transformers import (\n",
" AutoTokenizer,\n",
" AutoModelForCausalLM,\n",
" TextStreamer,\n",
" BitsAndBytesConfig\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c638185",
"metadata": {},
"outputs": [],
"source": [
"hf_token = userdata.get(\"HF_TOKEN\")\n",
"login(hf_token)\n",
"\n",
"print(\"GPU available:\", torch.cuda.is_available())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56e0bec6",
"metadata": {},
"outputs": [],
"source": [
"MODEL = \"microsoft/Phi-3-mini-4k-instruct\" #you can change the model to any other model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec56247e",
"metadata": {},
"outputs": [],
"source": [
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" MODEL,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "666c7736",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbbeec44",
"metadata": {},
"outputs": [],
"source": [
"def prompt_builder(topic, columns, rows):\n",
"    \"\"\"Build the full LLM prompt: system directive + few-shot example + request.\n",
"\n",
"    The three parts are concatenated in order so the model sees its role,\n",
"    then the expected CSV format, then the user's concrete request.\n",
"    \"\"\"\n",
"    sys_msg = \"\"\"\n",
"You are a synthetic data generator.\n",
"Generate realistic datasets only.\n",
"Output ONLY CSV data.\n",
"No explanations.\n",
"\"\"\"\n",
"\n",
"    few_shot = \"\"\"\n",
"Example CSV:\n",
"\n",
"id,name,species,category\n",
"1,Lion,Mammal,Savannah\n",
"2,Penguin,Bird,Antarctic\n",
"3,Elephant,Mammal,Forest\n",
"\"\"\"\n",
"\n",
"    request = f\"\"\"\n",
"Create a synthetic dataset.\n",
"\n",
"Topic: {topic}\n",
"Columns: {columns}\n",
"Rows: {rows}\n",
"\n",
"Follow the example format EXACTLY.\n",
"Return ONLY CSV text.\n",
"\"\"\"\n",
"\n",
"    # Order matters: directive first, then example, then the actual request.\n",
"    return \"\".join((sys_msg, few_shot, request))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1514ced1",
"metadata": {},
"outputs": [],
"source": [
"def generate_dataset(topic, columns, rows):\n",
"    \"\"\"Ask the LLM for CSV text describing the requested dataset.\n",
"\n",
"    Returns the decoded completion with the prompt tokens stripped and\n",
"    surrounding whitespace removed.\n",
"    \"\"\"\n",
"    prompt = prompt_builder(topic, columns, rows)\n",
"\n",
"    # Move inputs to wherever device_map=\"auto\" actually placed the model,\n",
"    # instead of hard-coding \"cuda\" — this keeps the cell working on\n",
"    # CPU-only runtimes as well.\n",
"    inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
"\n",
"    output = model.generate(\n",
"        **inputs,\n",
"        max_new_tokens=2048,\n",
"        do_sample=True,\n",
"        temperature=0.7,\n",
"        pad_token_id=tokenizer.eos_token_id\n",
"    )\n",
"\n",
"    # Slice off the prompt so only newly generated tokens are decoded.\n",
"    generated_tokens = output[0][inputs.input_ids.shape[-1]:]\n",
"    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
"\n",
"    return text.strip()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2235727",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from io import StringIO\n",
"import pandas as pd\n",
"\n",
"def clean_csv_text(csv_text):\n",
"    \"\"\"Keep only well-formed CSV lines from the raw model output.\n",
"\n",
"    The first comma-containing line is taken as the header; each later\n",
"    line is kept only when its comma-split field count matches the\n",
"    header's. Raises ValueError when no header can be found.\n",
"    \"\"\"\n",
"    rows = [ln.strip() for ln in csv_text.splitlines() if ln.strip()]\n",
"\n",
"    header_idx = next((i for i, ln in enumerate(rows) if \",\" in ln), None)\n",
"    if header_idx is None:\n",
"        raise ValueError(\"No CSV header detected\")\n",
"\n",
"    header = rows[header_idx]\n",
"    expected = len(header.split(\",\"))\n",
"\n",
"    # Drop chatter/truncated rows whose column count disagrees with the header.\n",
"    body = [ln for ln in rows[header_idx + 1:] if len(ln.split(\",\")) == expected]\n",
"\n",
"    return \"\\n\".join([header] + body)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eca36c97",
"metadata": {},
"outputs": [],
"source": [
"def csv_to_dataframe_converter(csv_text):\n",
"    \"\"\"Clean raw model output and parse it into a pandas DataFrame.\n",
"\n",
"    Raises ValueError (with a short preview of the offending text) when\n",
"    pandas cannot parse the cleaned CSV.\n",
"    \"\"\"\n",
"    cleaned = clean_csv_text(csv_text)\n",
"    try:\n",
"        return pd.read_csv(StringIO(cleaned), sep=\",\")\n",
"    except Exception as e:\n",
"        raise ValueError(f\"Failed to parse CSV: {e}\\nPreview:\\n{cleaned[:500]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "092dbfc5",
"metadata": {},
"outputs": [],
"source": [
"from google.colab import drive\n",
"drive.mount(\"/content/drive\", force_remount=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "156a42f2",
"metadata": {},
"outputs": [],
"source": [
"def save_dataset(df, filename_prefix=\"synthetic_data\"):\n",
"    \"\"\"Write df to Google Drive as a timestamped CSV and return its path.\n",
"\n",
"    NOTE(review): parameter was previously misspelled 'filename_prifix';\n",
"    the only caller in this notebook uses the default, so the rename is\n",
"    backward-compatible here.\n",
"    \"\"\"\n",
"    folder_path = \"/content/drive/MyDrive/llms/datasets\"\n",
"    os.makedirs(folder_path, exist_ok=True)\n",
"    # Timestamp keeps repeated generations from overwriting each other.\n",
"    timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"    filename = f\"{filename_prefix}_{timestamp}.csv\"\n",
"    path = os.path.join(folder_path, filename)\n",
"    df.to_csv(path, index=False)\n",
"    print(f\"Saved CSV to {path}\")\n",
"    return path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff704dbe",
"metadata": {},
"outputs": [],
"source": [
"def app(topic, columns, rows):\n",
"    \"\"\"Gradio callback: generate, parse, save, and return (DataFrame, path).\"\"\"\n",
"    columns_list = [c.strip() for c in columns.split(\",\") if c.strip()]\n",
"\n",
"    csv_text = generate_dataset(topic, columns_list, rows)\n",
"\n",
"    try:\n",
"        df = csv_to_dataframe_converter(csv_text)\n",
"    except ValueError as e:\n",
"        # Returning a plain string into the gr.Dataframe output breaks the\n",
"        # component; raise gr.Error so Gradio shows the message in the UI.\n",
"        raise gr.Error(str(e))\n",
"\n",
"    # Save to Drive; the path feeds the gr.File download component.\n",
"    file_path = save_dataset(df)\n",
"\n",
"    return df, file_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e92a2634",
"metadata": {},
"outputs": [],
"source": [
"with gr.Blocks(theme=gr.themes.Soft(), title=\"Synthetic Data Generator\") as demo:\n",
"\n",
" gr.Markdown(\n",
" \"\"\"\n",
" # Synthetic Data Generator\n",
"\n",
" Generate realistic synthetic datasets instantly using AI.\n",
"\n",
" **Steps**\n",
" 1. Enter a dataset topic \n",
" 2. Specify columns (comma separated) \n",
" 3. Choose number of rows \n",
" 4. Click **Generate Dataset**\n",
" \"\"\"\n",
" )\n",
"\n",
" with gr.Row():\n",
" with gr.Column(scale=1):\n",
"\n",
" topic = gr.Textbox(\n",
" label=\"Dataset Topic\",\n",
" placeholder=\"e.g. Ecommerce products, Climate data, Customer churn\"\n",
" )\n",
"\n",
" columns = gr.Textbox(\n",
" label=\"Columns\",\n",
" placeholder=\"id, name, category, price\"\n",
" )\n",
"\n",
" rows = gr.Slider(\n",
" minimum=5,\n",
" maximum=200,\n",
" value=20,\n",
" step=1,\n",
" label=\"Number of Rows\"\n",
" )\n",
"\n",
" generate_btn = gr.Button(\n",
" \" Generate Dataset\",\n",
" variant=\"primary\"\n",
" )\n",
"\n",
" with gr.Column(scale=2):\n",
" preview = gr.Dataframe(\n",
" label=\"Dataset Preview\",\n",
" interactive=False\n",
" )\n",
"\n",
" download = gr.File(\n",
" label=\"⬇️ Download CSV\"\n",
" )\n",
"\n",
" generate_btn.click(\n",
" fn=app,\n",
" inputs=[topic, columns, rows],\n",
" outputs=[preview, download]\n",
" )\n",
"\n",
"demo.launch()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}