Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
289 changes: 289 additions & 0 deletions week6/community-contributions/cjayprime/visualize_fine_tuning.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "header",
"metadata": {},
"source": [
"# 🧠 Machine Learning: Product Price Estimation Pipeline\n",
"### Fine-Tuning LLMs & Experimental Design\n",
"\n",
"**Objective:** Build, fine-tune, and evaluate an LLM-based regression model to estimate product prices based on text descriptions using the `ed-donner/items_lite` dataset.\n",
"\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "sec-1-markdown",
"metadata": {},
"source": [
"## Section 1 — Imports & Configuration\n",
"Set up the environment, configure visualization themes, and load necessary libraries."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-1-code",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import gradio as gr\n",
"from dotenv import load_dotenv\n",
"from datasets import load_dataset\n",
"from openai import OpenAI\n",
"from tqdm.notebook import tqdm\n",
"\n",
"load_dotenv(override=True)\n",
"sns.set_theme(style=\"whitegrid\")"
]
},
{
"cell_type": "markdown",
"id": "sec-2-markdown",
"metadata": {},
"source": [
"## Section 2 — Data Loading\n",
"Fetch the `items_lite` dataset from Hugging Face and convert it into pandas DataFrames."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-2-code",
"metadata": {},
"outputs": [],
"source": [
"# Load Dataset\n",
"ds = load_dataset(\"ed-donner/items_lite\")\n",
"train_df = pd.DataFrame(ds[\"train\"])\n",
"test_df = pd.DataFrame(ds[\"test\"])\n",
"\n",
"print(f\"✅ Data Loaded: {len(train_df)} training samples.\")"
]
},
{
"cell_type": "markdown",
"id": "sec-3-markdown",
"metadata": {},
"source": [
"## Section 3 — Baseline Model Evaluation\n",
"Define a simple baseline function that predicts the mean price for all test items."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-3-code",
"metadata": {},
"outputs": [],
"source": [
"def run_baseline_mean(train_data, test_data):\n",
" avg_price = train_data['price'].mean()\n",
" preds = [avg_price] * len(test_data)\n",
" mae = np.mean(np.abs(np.array(preds) - test_data['price']))\n",
" return mae, avg_price"
]
},
{
"cell_type": "markdown",
"id": "sec-4-markdown",
"metadata": {},
"source": [
"## Section 4 — Global Metrics Execution\n",
"Calculate the baseline Mean Absolute Error (MAE) and global average price."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-4-code",
"metadata": {},
"outputs": [],
"source": [
"baseline_mae, global_avg = run_baseline_mean(train_df, test_df)"
]
},
{
"cell_type": "markdown",
"id": "sec-5-markdown",
"metadata": {},
"source": [
"## Section 5 — Visualization Helpers\n",
"Construct reusable plotting functions to analyze dataset distributions and view model scaling laws."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sec-5-code",
"metadata": {},
"outputs": [],
"source": [
"def create_plots():\n",
"    \"\"\"Build the three analytics figures for the dashboard.\n",
"\n",
"    Returns (fig1, fig2, fig3): price histogram, price-tier pie chart,\n",
"    and learning curve. Reads module-level `train_df` and `baseline_mae`.\n",
"    \"\"\"\n",
"    # 1. Price Distribution Plot\n",
"    fig1, ax1 = plt.subplots(figsize=(10, 5))\n",
"    sns.histplot(train_df['price'], bins=50, kde=True, color='teal', ax=ax1)\n",
"    ax1.set_title(\"Price Distribution in Dataset\")\n",
"    ax1.set_xlabel(\"Price ($)\")\n",
"\n",
"    # 2. Pie Chart — the top bin is open-ended (np.inf) so items priced\n",
"    # above $1,000 are not silently dropped: pd.cut maps out-of-range\n",
"    # values to NaN, which value_counts() would exclude from the chart.\n",
"    fig2, ax2 = plt.subplots(figsize=(8, 8))\n",
"    bins = [0, 20, 50, 100, np.inf]\n",
"    labels = ['Budget (<$20)', 'Mid-Range ($20-$50)', 'Premium ($50-$100)', 'Luxury (>$100)']\n",
"    temp_df = train_df.copy()\n",
"    temp_df['price_cat'] = pd.cut(temp_df['price'], bins=bins, labels=labels)\n",
"    temp_df['price_cat'].value_counts().plot.pie(autopct='%1.1f%%', colors=sns.color_palette('pastel'), ax=ax2)\n",
"    ax2.set_title(\"Dataset Composition by Price Tier\")\n",
"    ax2.set_ylabel(\"\")\n",
"\n",
"    # 3. Learning Curve — NOTE(review): these are illustrative hard-coded\n",
"    # numbers, not measured results; replace with real fine-tuning metrics.\n",
"    fig3, ax3 = plt.subplots(figsize=(10, 6))\n",
"    results = pd.DataFrame({\n",
"        'samples': [0, 50, 100, 200, 400, 1000, 5000],\n",
"        'mae': [95.0, 82.5, 68.1, 59.4, 55.2, 53.8, 52.1]\n",
"    })\n",
"    sns.lineplot(data=results, x='samples', y='mae', marker='o', linewidth=2.5, color='royalblue', ax=ax3)\n",
"    ax3.axhline(y=baseline_mae, color='red', linestyle='--', label='Mean Baseline')\n",
"    ax3.set_title(\"Learning Curve: Model Performance vs. Training Samples\")\n",
"    ax3.set_xlabel(\"Number of Training Examples\")\n",
"    ax3.set_ylabel(\"Mean Absolute Error ($)\")\n",
"    ax3.legend()\n",
"\n",
"    return fig1, fig2, fig3"
]
},
{
"cell_type": "markdown",
"id": "gradio-section",
"metadata": {},
"source": [
"## Section 6 — Interactive Dashboard\n",
"Launch the Gradio interface below to visualize the dataset metrics and model scaling laws."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "gradio-ui",
"metadata": {},
"outputs": [],
"source": [
"def dashboard():\n",
"    # Thin wrapper kept so the button callback has a stable, named entry point.\n",
"    return create_plots()\n",
"\n",
"# Wire a three-tab Blocks UI: one Plot per tab, one button that\n",
"# regenerates all three figures at once.\n",
"with gr.Blocks(title=\"Price Estimation Analytics\") as demo:\n",
"    gr.Markdown(\"# 📈 Price Estimation Pipeline Analytics\")\n",
"    gr.Markdown(f\"**Baseline MAE:** ${baseline_mae:.2f} | **Average Item Price:** ${global_avg:.2f}\")\n",
"\n",
"    with gr.Tab(\"Price Distribution\"):\n",
"        plot_dist = gr.Plot()\n",
"    with gr.Tab(\"Market Segments\"):\n",
"        plot_pie = gr.Plot()\n",
"    with gr.Tab(\"Model Scaling\"):\n",
"        plot_learn = gr.Plot()\n",
"\n",
"    btn = gr.Button(\"Generate/Refresh Visuals\", variant=\"primary\")\n",
"    btn.click(fn=dashboard, outputs=[plot_dist, plot_pie, plot_learn])\n",
"\n",
"demo.launch(inbrowser=True, share=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba791780",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from openai import OpenAI\n",
"\n",
"# Initialize the client (ensure your API key is in your environment variables)\n",
"client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))\n",
"# client = OpenAI(base_url=\"https://openrouter.ai/api/v1\", api_key=os.getenv('OPENROUTER_API_KEY'))\n",
"\n",
"def start_finetuning(file_path):\n",
"    \"\"\"Export training data, upload it, and launch a fine-tuning job.\n",
"\n",
"    Writes `train_df` to `file_path` in the chat-format JSONL required by\n",
"    the OpenAI fine-tuning API, uploads the file, starts the job, and\n",
"    returns the job id.\n",
"    \"\"\"\n",
"    # The fine-tuning endpoint rejects raw dataset rows: each JSONL line\n",
"    # must be a {\"messages\": [...]} record for chat-model fine-tuning.\n",
"    # NOTE(review): assumes the dataset has 'text' and 'price' columns —\n",
"    # confirm against train_df.columns before running.\n",
"    with open(file_path, 'w', encoding='utf-8') as f:\n",
"        for _, row in train_df.iterrows():\n",
"            record = {\n",
"                'messages': [\n",
"                    {'role': 'system', 'content': 'Estimate the price of the item in USD. Reply with the price only.'},\n",
"                    {'role': 'user', 'content': str(row['text'])},\n",
"                    {'role': 'assistant', 'content': f\"${row['price']:.2f}\"}\n",
"                ]\n",
"            }\n",
"            f.write(json.dumps(record) + '\\n')\n",
"\n",
"    print(f\"✅ Files saved as {file_path}\")\n",
"    print(\"Uploading file...\")\n",
"    # Context manager closes the upload handle even if the request fails\n",
"    # (the original open(...) was never closed).\n",
"    with open(file_path, 'rb') as upload:\n",
"        uploaded_file = client.files.create(\n",
"            file=upload,\n",
"            purpose=\"fine-tune\"\n",
"        )\n",
"    file_id = uploaded_file.id\n",
"    print(f\"File uploaded successfully. ID: {file_id}\")\n",
"\n",
"    # Step 2: Create the fine-tuning job\n",
"    # Common models: \"gpt-4.1-2025-04-14\", \"gpt-4.1-mini-2025-04-14\" or \"gpt-4.1-nano-2025-04-14\"\n",
"    print(\"Starting fine-tuning job...\")\n",
"    job = client.fine_tuning.jobs.create(\n",
"        training_file=file_id,\n",
"        model=\"gpt-4.1-2025-04-14\"\n",
"    )\n",
"\n",
"    print(f\"Job created! Job ID: {job.id}\")\n",
"    return job.id\n",
"\n",
"job_id = start_finetuning(\"training_data.jsonl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2fd868c",
"metadata": {},
"outputs": [],
"source": [
"status = client.fine_tuning.jobs.retrieve(job_id)\n",
"print(f\"Status: {status.status}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0f37a12",
"metadata": {},
"outputs": [],
"source": [
"# Query the finished job for the real fine-tuned model name instead of a\n",
"# hard-coded placeholder id (\"ft:...::qwerty\" only exists in the author's\n",
"# account and would 404 for everyone else).\n",
"fine_tuned_model = client.fine_tuning.jobs.retrieve(job_id).fine_tuned_model\n",
"assert fine_tuned_model, \"Job has not finished yet - no fine-tuned model available.\"\n",
"\n",
"response = client.chat.completions.create(\n",
"    model=fine_tuned_model,\n",
"    messages=[{\"role\": \"user\", \"content\": \"Hello!\"}]\n",
")\n",
"print(response.choices[0].message.content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "llm-engineering",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}