Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
289 changes: 289 additions & 0 deletions week6/community-contributions/cjayprime/visualize_fine_tuning.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "header",
"metadata": {},
"source": [
"# 🧠 Machine Learning: Product Price Estimation Pipeline\n",
"### Fine-Tuning LLMs & Experimental Design\n",
"\n",
"**Objective:** Build, fine-tune, and evaluate an LLM-based regression model to estimate product prices based on text descriptions using the `ed-donner/items_lite` dataset.\n",
"\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "sec-1-markdown",
"metadata": {},
"source": [
"## Section 1 — Imports & Configuration\n",
"Set up the environment, configure visualization themes, and load necessary libraries."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-1-code",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import gradio as gr\n",
"from dotenv import load_dotenv\n",
"from datasets import load_dataset\n",
"from openai import OpenAI\n",
"from tqdm.notebook import tqdm\n",
"\n",
"load_dotenv(override=True)\n",
"sns.set_theme(style=\"whitegrid\")"
]
},
{
"cell_type": "markdown",
"id": "sec-2-markdown",
"metadata": {},
"source": [
"## Section 2 — Data Loading\n",
"Fetch the `items_lite` dataset from Hugging Face and convert it into pandas DataFrames."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-2-code",
"metadata": {},
"outputs": [],
"source": [
"# Load Dataset\n",
"ds = load_dataset(\"ed-donner/items_lite\")\n",
"train_df = pd.DataFrame(ds[\"train\"])\n",
"test_df = pd.DataFrame(ds[\"test\"])\n",
"\n",
"print(f\"✅ Data Loaded: {len(train_df)} training samples.\")"
]
},
{
"cell_type": "markdown",
"id": "sec-3-markdown",
"metadata": {},
"source": [
"## Section 3 — Baseline Model Evaluation\n",
"Define a simple baseline function that predicts the mean price for all test items."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-3-code",
"metadata": {},
"outputs": [],
"source": [
"def run_baseline_mean(train_data, test_data):\n",
" avg_price = train_data['price'].mean()\n",
" preds = [avg_price] * len(test_data)\n",
" mae = np.mean(np.abs(np.array(preds) - test_data['price']))\n",
" return mae, avg_price"
]
},
{
"cell_type": "markdown",
"id": "sec-4-markdown",
"metadata": {},
"source": [
"## Section 4 — Global Metrics Execution\n",
"Calculate the baseline Mean Absolute Error (MAE) and global average price."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-4-code",
"metadata": {},
"outputs": [],
"source": [
"baseline_mae, global_avg = run_baseline_mean(train_df, test_df)"
]
},
{
"cell_type": "markdown",
"id": "sec-5-markdown",
"metadata": {},
"source": [
"## Section 5 — Visualization Helpers\n",
"Construct reusable plotting functions to analyze dataset distributions and view model scaling laws."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-5-code",
"metadata": {},
"outputs": [],
"source": [
"def create_plots():\n",
"    \"\"\"Build the three analytics figures for the dashboard.\n",
"\n",
"    Returns (fig1, fig2, fig3): price histogram, price-tier pie chart,\n",
"    and learning curve. Reads module-level `train_df` and `baseline_mae`.\n",
"    \"\"\"\n",
"    # 1. Price Distribution Plot\n",
"    fig1, ax1 = plt.subplots(figsize=(10, 5))\n",
"    sns.histplot(train_df['price'], bins=50, kde=True, color='teal', ax=ax1)\n",
"    ax1.set_title(\"Price Distribution in Dataset\")\n",
"    ax1.set_xlabel(\"Price ($)\")\n",
"\n",
"    # 2. Pie Chart — the top bin is open-ended (np.inf) so items priced\n",
"    # above $1,000 are not silently dropped: pd.cut maps out-of-range\n",
"    # values to NaN, which value_counts() would exclude from the chart.\n",
"    fig2, ax2 = plt.subplots(figsize=(8, 8))\n",
"    bins = [0, 20, 50, 100, np.inf]\n",
"    labels = ['Budget (<$20)', 'Mid-Range ($20-$50)', 'Premium ($50-$100)', 'Luxury (>$100)']\n",
"    temp_df = train_df.copy()\n",
"    temp_df['price_cat'] = pd.cut(temp_df['price'], bins=bins, labels=labels)\n",
"    temp_df['price_cat'].value_counts().plot.pie(autopct='%1.1f%%', colors=sns.color_palette('pastel'), ax=ax2)\n",
"    ax2.set_title(\"Dataset Composition by Price Tier\")\n",
"    ax2.set_ylabel(\"\")\n",
"\n",
"    # 3. Learning Curve — NOTE(review): these are illustrative hard-coded\n",
"    # numbers, not measured results; replace with real fine-tuning metrics.\n",
"    fig3, ax3 = plt.subplots(figsize=(10, 6))\n",
"    results = pd.DataFrame({\n",
"        'samples': [0, 50, 100, 200, 400, 1000, 5000],\n",
"        'mae': [95.0, 82.5, 68.1, 59.4, 55.2, 53.8, 52.1]\n",
"    })\n",
"    sns.lineplot(data=results, x='samples', y='mae', marker='o', linewidth=2.5, color='royalblue', ax=ax3)\n",
"    ax3.axhline(y=baseline_mae, color='red', linestyle='--', label='Mean Baseline')\n",
"    ax3.set_title(\"Learning Curve: Model Performance vs. Training Samples\")\n",
"    ax3.set_xlabel(\"Number of Training Examples\")\n",
"    ax3.set_ylabel(\"Mean Absolute Error ($)\")\n",
"    ax3.legend()\n",
"\n",
"    return fig1, fig2, fig3"
]
},
{
"cell_type": "markdown",
"id": "gradio-section",
"metadata": {},
"source": [
"## Section 6 — Interactive Dashboard\n",
"Launch the Gradio interface below to visualize the dataset metrics and model scaling laws."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "gradio-ui",
"metadata": {},
"outputs": [],
"source": [
"def dashboard():\n",
"    # Thin wrapper kept so the button callback has a stable, named entry point.\n",
"    return create_plots()\n",
"\n",
"# Wire a three-tab Blocks UI: one Plot per tab, one button that\n",
"# regenerates all three figures at once.\n",
"with gr.Blocks(title=\"Price Estimation Analytics\") as demo:\n",
"    gr.Markdown(\"# 📈 Price Estimation Pipeline Analytics\")\n",
"    gr.Markdown(f\"**Baseline MAE:** ${baseline_mae:.2f} | **Average Item Price:** ${global_avg:.2f}\")\n",
"\n",
"    with gr.Tab(\"Price Distribution\"):\n",
"        plot_dist = gr.Plot()\n",
"    with gr.Tab(\"Market Segments\"):\n",
"        plot_pie = gr.Plot()\n",
"    with gr.Tab(\"Model Scaling\"):\n",
"        plot_learn = gr.Plot()\n",
"\n",
"    btn = gr.Button(\"Generate/Refresh Visuals\", variant=\"primary\")\n",
"    btn.click(fn=dashboard, outputs=[plot_dist, plot_pie, plot_learn])\n",
"\n",
"demo.launch(inbrowser=True, share=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba791780",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from openai import OpenAI\n",
"\n",
"# Initialize the client (ensure your API key is in your environment variables)\n",
"client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))\n",
"# client = OpenAI(base_url=\"https://openrouter.ai/api/v1\", api_key=os.getenv('OPENROUTER_API_KEY'))\n",
"\n",
"def start_finetuning(file_path):\n",
"    \"\"\"Export training data, upload it, and launch a fine-tuning job.\n",
"\n",
"    Writes `train_df` to `file_path` in the chat-format JSONL required by\n",
"    the OpenAI fine-tuning API, uploads the file, starts the job, and\n",
"    returns the job id.\n",
"    \"\"\"\n",
"    # The fine-tuning endpoint rejects raw dataset rows: each JSONL line\n",
"    # must be a {\"messages\": [...]} record for chat-model fine-tuning.\n",
"    # NOTE(review): assumes the dataset has 'text' and 'price' columns —\n",
"    # confirm against train_df.columns before running.\n",
"    with open(file_path, 'w', encoding='utf-8') as f:\n",
"        for _, row in train_df.iterrows():\n",
"            record = {\n",
"                'messages': [\n",
"                    {'role': 'system', 'content': 'Estimate the price of the item in USD. Reply with the price only.'},\n",
"                    {'role': 'user', 'content': str(row['text'])},\n",
"                    {'role': 'assistant', 'content': f\"${row['price']:.2f}\"}\n",
"                ]\n",
"            }\n",
"            f.write(json.dumps(record) + '\\n')\n",
"\n",
"    print(f\"✅ Files saved as {file_path}\")\n",
"    print(\"Uploading file...\")\n",
"    # Context manager closes the upload handle even if the request fails\n",
"    # (the original open(...) was never closed).\n",
"    with open(file_path, 'rb') as upload:\n",
"        uploaded_file = client.files.create(\n",
"            file=upload,\n",
"            purpose=\"fine-tune\"\n",
"        )\n",
"    file_id = uploaded_file.id\n",
"    print(f\"File uploaded successfully. ID: {file_id}\")\n",
"\n",
"    # Step 2: Create the fine-tuning job\n",
"    # Common models: \"gpt-4.1-2025-04-14\", \"gpt-4.1-mini-2025-04-14\" or \"gpt-4.1-nano-2025-04-14\"\n",
"    print(\"Starting fine-tuning job...\")\n",
"    job = client.fine_tuning.jobs.create(\n",
"        training_file=file_id,\n",
"        model=\"gpt-4.1-2025-04-14\"\n",
"    )\n",
"\n",
"    print(f\"Job created! Job ID: {job.id}\")\n",
"    return job.id\n",
"\n",
"job_id = start_finetuning(\"training_data.jsonl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2fd868c",
"metadata": {},
"outputs": [],
"source": [
"status = client.fine_tuning.jobs.retrieve(job_id)\n",
"print(f\"Status: {status.status}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0f37a12",
"metadata": {},
"outputs": [],
"source": [
"# Query the finished job for the real fine-tuned model name instead of a\n",
"# hard-coded placeholder id (\"ft:...::qwerty\" only exists in the author's\n",
"# account and would 404 for everyone else).\n",
"fine_tuned_model = client.fine_tuning.jobs.retrieve(job_id).fine_tuned_model\n",
"assert fine_tuned_model, \"Job has not finished yet - no fine-tuned model available.\"\n",
"\n",
"response = client.chat.completions.create(\n",
"    model=fine_tuned_model,\n",
"    messages=[{\"role\": \"user\", \"content\": \"Hello!\"}]\n",
")\n",
"print(response.choices[0].message.content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "llm-engineering",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}