RedHatTraining · jbenignocardoso · Mar 2, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/model/compression/compress_granite.ipynb b/model/compression/compress_granite.ipynb
@@ -0,0 +1,237 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "intro",
+   "metadata": {},
+   "source": [
+    "# Model Compression with LLM Compressor\n",
+    "\n",
+    "Compress the IBM Granite 4.0 350M model by using INT8 W8A16 quantization."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "import-section",
+   "metadata": {},
+   "source": [
+    "### 1. Import libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "imports",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "from llmcompressor import oneshot\n",
+    "from llmcompressor.modifiers.quantization import QuantizationModifier\n",
+    "from accelerate import dispatch_model\n",
+    "from accelerate.utils.modeling import infer_auto_device_map\n",
+    "from pathlib import Path\n",
+    "\n",
+    "print(\"✓ Libraries imported\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "config-section",
+   "metadata": {},
+   "source": [
+    "### 2. Configure paths"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "config",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_PATH = \"/shared-models/granite-4.0-350m\"\n",
+    "SAVE_DIR = \"granite-INT8\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "load-section",
+   "metadata": {},
+   "source": [
+    "### 3. Load model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "load-model",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Resolve the shared-storage location once and reuse it for both loaders\n",
+    "model_dir = os.path.abspath(MODEL_PATH)\n",
+    "\n",
+    "# Load weights and tokenizer strictly from local files, without remote code\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    model_dir,\n",
+    "    torch_dtype=\"auto\",\n",
+    "    low_cpu_mem_usage=True,\n",
+    "    local_files_only=True,\n",
+    "    trust_remote_code=False\n",
+    ")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\n",
+    "    model_dir,\n",
+    "    local_files_only=True,\n",
+    "    trust_remote_code=False\n",
+    ")\n",
+    "\n",
+    "# Report the model size in millions of parameters\n",
+    "param_count = sum(p.numel() for p in model.parameters())\n",
+    "print(f\"✓ Model loaded: {param_count / 1e6:.0f}M parameters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "test-original-section",
+   "metadata": {},
+   "source": [
+    "### 4. Test original model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "test-original",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_prompt = \"The capital of France is\"\n",
+    "inputs = tokenizer(test_prompt, return_tensors=\"pt\")\n",
+    "outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)\n",
+    "original_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
+    "\n",
+    "print(f\"Original output: '{original_text}'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "quantize-section",
+   "metadata": {},
+   "source": [
+    "### 5. Apply quantization\n",
+    "\n",
+    "Configure the quantization recipe and apply it to the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "quantize",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configure quantization recipe\n",
+    "recipe = QuantizationModifier(\n",
+    "    targets=\"Linear\",     # Apply to all linear layers\n",
+    "    scheme=\"W8A16\",       # INT8 weights, 16-bit activations (not quantized)\n",
+    "    ignore=[]             # Quantize all layers (including lm_head)\n",
+    ")\n",
+    "\n",
+    "# Apply one-shot quantization\n",
+    "oneshot(model=model, recipe=recipe)\n",
+    "print(\"✓ Quantization complete\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "test-quantized-section",
+   "metadata": {},
+   "source": [
+    "### 6. Test quantized model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "test-quantized",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Place the quantized model on the available devices before generating\n",
+    "device_map = infer_auto_device_map(model)\n",
+    "model = dispatch_model(model, device_map)\n",
+    "\n",
+    "# Move the full tokenizer output (input IDs and attention mask) to the model device,\n",
+    "# so that generation receives the same inputs as in the original-model test\n",
+    "inputs = tokenizer(test_prompt, return_tensors=\"pt\").to(model.device)\n",
+    "outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)\n",
+    "quantized_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
+    "\n",
+    "print(f\"Quantized output: '{quantized_text}'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "save-section",
+   "metadata": {},
+   "source": [
+    "### 7. Save compressed model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "save-model",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Path(SAVE_DIR).mkdir(parents=True, exist_ok=True)\n",
+    "model.save_pretrained(SAVE_DIR)\n",
+    "tokenizer.save_pretrained(SAVE_DIR)\n",
+    "\n",
+    "print(f\"✓ Model saved to {SAVE_DIR}/\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "compare-section",
+   "metadata": {},
+   "source": [
+    "### 8. Compare file sizes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "compare-sizes",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess\n",
+    "\n",
+    "def get_size_mb(path):\n",
+    "    \"\"\"Return the disk usage of `path` in whole megabytes, as reported by `du -sm`.\n",
+    "\n",
+    "    Raises subprocess.CalledProcessError if `du` fails (for example, when the path is missing).\n",
+    "    \"\"\"\n",
+    "    # check=True turns a failing `du` into a clear error instead of an IndexError on empty output\n",
+    "    result = subprocess.run([\"du\", \"-sm\", path], capture_output=True, text=True, check=True)\n",
+    "    return int(result.stdout.split()[0])\n",
+    "\n",
+    "original_size = get_size_mb(MODEL_PATH)\n",
+    "compressed_size = get_size_mb(SAVE_DIR)\n",
+    "# Guard the division in case `du` reports 0 MB for the original directory\n",
+    "reduction = ((original_size - compressed_size) / original_size) * 100 if original_size else 0.0\n",
+    "\n",
+    "print(\"=\"*50)\n",
+    "print(f\"Original:    {original_size:>6} MB\")\n",
+    "print(f\"Compressed:  {compressed_size:>6} MB\")\n",
+    "print(f\"Reduction:   {reduction:>6.1f}%\")\n",
+    "print(\"=\"*50)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}