Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,382 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6fbf66d6",
"metadata": {},
"source": [
"## Synthetic Data Generator\n",
"This Data generator uses a large language model and a Gradio interface to allows users to dynamically generate realistic tabular datasets in CSV format based on a specified topic, column structure, and number of rows. It's designed to run on Colab only.\n",
"\n",
"The application constructs a structured prompt that instructs the model to output strictly formatted CSV data, which is then parsed into a pandas DataFrame for preview and download. The generated dataset can be exported as a CSV file for further analysis or experimentation.\n",
"\n",
"Key Features\n",
"\n",
"- Generate synthetic datasets from natural language input\n",
"\n",
"- Customizable topic, columns, and dataset size\n",
"\n",
"- Automatic CSV validation and parsing\n",
"\n",
"- Data preview within a Gradio UI\n",
"\n",
"- One-click CSV download\n",
"\n",
"- Designed for rapid prototyping and testing ML/data workflows\n",
"\n",
"Use Cases\n",
"\n",
"- Testing data pipelines\n",
"\n",
"- Creating mock datasets\n",
"\n",
"- Prototyping machine learning experiments\n",
"\n",
"- Educational and demonstration purposes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "225cf141",
"metadata": {},
"outputs": [],
"source": [
"!pip install -q transformers accelerate bitsandbytes gradio pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "878de128",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import pandas as pd\n",
"import gradio as gr\n",
"import gc\n",
"import os\n",
"from datetime import datetime\n",
"\n",
"from google.colab import userdata \n",
"from huggingface_hub import login\n",
"from transformers import (\n",
" AutoTokenizer,\n",
" AutoModelForCausalLM,\n",
" TextStreamer,\n",
" BitsAndBytesConfig\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c638185",
"metadata": {},
"outputs": [],
"source": [
"hf_token = userdata.get(\"HF_TOKEN\")\n",
"login(hf_token)\n",
"\n",
"print(\"GPU available:\", torch.cuda.is_available())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56e0bec6",
"metadata": {},
"outputs": [],
"source": [
"MODEL = \"microsoft/Phi-3-mini-4k-instruct\" #you can change the model to any other model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec56247e",
"metadata": {},
"outputs": [],
"source": [
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" MODEL,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "666c7736",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbbeec44",
"metadata": {},
"outputs": [],
"source": [
"def prompt_builder(topic, columns, rows):\n",
"    \"\"\"Build the full LLM prompt: system directive + few-shot example + request.\n",
"\n",
"    The three parts are concatenated in order so the model sees its role,\n",
"    then the expected CSV format, then the user's concrete request.\n",
"    \"\"\"\n",
"    sys_msg = \"\"\"\n",
"You are a synthetic data generator.\n",
"Generate realistic datasets only.\n",
"Output ONLY CSV data.\n",
"No explanations.\n",
"\"\"\"\n",
"\n",
"    few_shot = \"\"\"\n",
"Example CSV:\n",
"\n",
"id,name,species,category\n",
"1,Lion,Mammal,Savannah\n",
"2,Penguin,Bird,Antarctic\n",
"3,Elephant,Mammal,Forest\n",
"\"\"\"\n",
"\n",
"    request = f\"\"\"\n",
"Create a synthetic dataset.\n",
"\n",
"Topic: {topic}\n",
"Columns: {columns}\n",
"Rows: {rows}\n",
"\n",
"Follow the example format EXACTLY.\n",
"Return ONLY CSV text.\n",
"\"\"\"\n",
"\n",
"    # Order matters: directive first, then example, then the actual request.\n",
"    return \"\".join((sys_msg, few_shot, request))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1514ced1",
"metadata": {},
"outputs": [],
"source": [
"def generate_dataset(topic, columns, rows):\n",
"    \"\"\"Ask the LLM for CSV text describing the requested dataset.\n",
"\n",
"    Returns the decoded completion with the prompt tokens stripped and\n",
"    surrounding whitespace removed.\n",
"    \"\"\"\n",
"    prompt = prompt_builder(topic, columns, rows)\n",
"\n",
"    # Move inputs to wherever device_map=\"auto\" actually placed the model,\n",
"    # instead of hard-coding \"cuda\" — this keeps the cell working on\n",
"    # CPU-only runtimes as well.\n",
"    inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
"\n",
"    output = model.generate(\n",
"        **inputs,\n",
"        max_new_tokens=2048,\n",
"        do_sample=True,\n",
"        temperature=0.7,\n",
"        pad_token_id=tokenizer.eos_token_id\n",
"    )\n",
"\n",
"    # Slice off the prompt so only newly generated tokens are decoded.\n",
"    generated_tokens = output[0][inputs.input_ids.shape[-1]:]\n",
"    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
"\n",
"    return text.strip()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2235727",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from io import StringIO\n",
"import pandas as pd\n",
"\n",
"def clean_csv_text(csv_text):\n",
"    \"\"\"Keep only well-formed CSV lines from the raw model output.\n",
"\n",
"    The first comma-containing line is taken as the header; each later\n",
"    line is kept only when its comma-split field count matches the\n",
"    header's. Raises ValueError when no header can be found.\n",
"    \"\"\"\n",
"    rows = [ln.strip() for ln in csv_text.splitlines() if ln.strip()]\n",
"\n",
"    header_idx = next((i for i, ln in enumerate(rows) if \",\" in ln), None)\n",
"    if header_idx is None:\n",
"        raise ValueError(\"No CSV header detected\")\n",
"\n",
"    header = rows[header_idx]\n",
"    expected = len(header.split(\",\"))\n",
"\n",
"    # Drop chatter/truncated rows whose column count disagrees with the header.\n",
"    body = [ln for ln in rows[header_idx + 1:] if len(ln.split(\",\")) == expected]\n",
"\n",
"    return \"\\n\".join([header] + body)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eca36c97",
"metadata": {},
"outputs": [],
"source": [
"def csv_to_dataframe_converter(csv_text):\n",
"    \"\"\"Clean raw model output and parse it into a pandas DataFrame.\n",
"\n",
"    Raises ValueError (with a short preview of the offending text) when\n",
"    pandas cannot parse the cleaned CSV.\n",
"    \"\"\"\n",
"    cleaned = clean_csv_text(csv_text)\n",
"    try:\n",
"        return pd.read_csv(StringIO(cleaned), sep=\",\")\n",
"    except Exception as e:\n",
"        raise ValueError(f\"Failed to parse CSV: {e}\\nPreview:\\n{cleaned[:500]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "092dbfc5",
"metadata": {},
"outputs": [],
"source": [
"from google.colab import drive\n",
"drive.mount(\"/content/drive\", force_remount=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "156a42f2",
"metadata": {},
"outputs": [],
"source": [
"def save_dataset(df, filename_prefix=\"synthetic_data\"):\n",
"    \"\"\"Write df to Google Drive as a timestamped CSV and return its path.\n",
"\n",
"    NOTE(review): parameter was previously misspelled 'filename_prifix';\n",
"    the only caller in this notebook uses the default, so the rename is\n",
"    backward-compatible here.\n",
"    \"\"\"\n",
"    folder_path = \"/content/drive/MyDrive/llms/datasets\"\n",
"    os.makedirs(folder_path, exist_ok=True)\n",
"    # Timestamp keeps repeated generations from overwriting each other.\n",
"    timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"    filename = f\"{filename_prefix}_{timestamp}.csv\"\n",
"    path = os.path.join(folder_path, filename)\n",
"    df.to_csv(path, index=False)\n",
"    print(f\"Saved CSV to {path}\")\n",
"    return path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff704dbe",
"metadata": {},
"outputs": [],
"source": [
"def app(topic, columns, rows):\n",
"    \"\"\"Gradio callback: generate, parse, save, and return (DataFrame, path).\"\"\"\n",
"    columns_list = [c.strip() for c in columns.split(\",\") if c.strip()]\n",
"\n",
"    csv_text = generate_dataset(topic, columns_list, rows)\n",
"\n",
"    try:\n",
"        df = csv_to_dataframe_converter(csv_text)\n",
"    except ValueError as e:\n",
"        # Returning a plain string into the gr.Dataframe output breaks the\n",
"        # component; raise gr.Error so Gradio shows the message in the UI.\n",
"        raise gr.Error(str(e))\n",
"\n",
"    # Save to Drive; the path feeds the gr.File download component.\n",
"    file_path = save_dataset(df)\n",
"\n",
"    return df, file_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e92a2634",
"metadata": {},
"outputs": [],
"source": [
"with gr.Blocks(theme=gr.themes.Soft(), title=\"Synthetic Data Generator\") as demo:\n",
"\n",
" gr.Markdown(\n",
" \"\"\"\n",
" # Synthetic Data Generator\n",
"\n",
" Generate realistic synthetic datasets instantly using AI.\n",
"\n",
" **Steps**\n",
" 1. Enter a dataset topic \n",
" 2. Specify columns (comma separated) \n",
" 3. Choose number of rows \n",
" 4. Click **Generate Dataset**\n",
" \"\"\"\n",
" )\n",
"\n",
" with gr.Row():\n",
" with gr.Column(scale=1):\n",
"\n",
" topic = gr.Textbox(\n",
" label=\"Dataset Topic\",\n",
" placeholder=\"e.g. Ecommerce products, Climate data, Customer churn\"\n",
" )\n",
"\n",
" columns = gr.Textbox(\n",
" label=\"Columns\",\n",
" placeholder=\"id, name, category, price\"\n",
" )\n",
"\n",
" rows = gr.Slider(\n",
" minimum=5,\n",
" maximum=200,\n",
" value=20,\n",
" step=1,\n",
" label=\"Number of Rows\"\n",
" )\n",
"\n",
" generate_btn = gr.Button(\n",
" \" Generate Dataset\",\n",
" variant=\"primary\"\n",
" )\n",
"\n",
" with gr.Column(scale=2):\n",
" preview = gr.Dataframe(\n",
" label=\"Dataset Preview\",\n",
" interactive=False\n",
" )\n",
"\n",
" download = gr.File(\n",
" label=\"⬇️ Download CSV\"\n",
" )\n",
"\n",
" generate_btn.click(\n",
" fn=app,\n",
" inputs=[topic, columns, rows],\n",
" outputs=[preview, download]\n",
" )\n",
"\n",
"demo.launch()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}