Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
237 changes: 237 additions & 0 deletions model/compression/compress_granite.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "intro",
"metadata": {},
"source": [
"# Model Compression with LLM Compressor\n",
"\n",
"Compress the IBM Granite 4.0 350M model using INT8 W8A16 quantization."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Compress the IBM Granite 4.0 350M model using INT8 W8A16 quantization."
"Compress the IBM Granite 4.0 350M model by using INT8 W8A16 quantization."

]
},
{
"cell_type": "markdown",
"id": "import-section",
"metadata": {},
"source": [
"### 1. Import libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "imports",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"from llmcompressor import oneshot\n",
"from llmcompressor.modifiers.quantization import QuantizationModifier\n",
"from accelerate import dispatch_model\n",
"from accelerate.utils.modeling import infer_auto_device_map\n",
"from pathlib import Path\n",
"\n",
"print(\"✓ Libraries imported\")"
]
},
{
"cell_type": "markdown",
"id": "config-section",
"metadata": {},
"source": [
"### 2. Configure paths"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "config",
"metadata": {},
"outputs": [],
"source": [
"MODEL_PATH = \"/shared-models/granite-4.0-350m\"\n",
"SAVE_DIR = \"granite-INT8\""
]
},
{
"cell_type": "markdown",
"id": "load-section",
"metadata": {},
"source": [
"### 3. Load model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "load-model",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Load model from shared storage\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" os.path.abspath(MODEL_PATH),\n",
" torch_dtype=\"auto\",\n",
" low_cpu_mem_usage=True,\n",
" local_files_only=True,\n",
" trust_remote_code=False\n",
")\n",
"tokenizer = AutoTokenizer.from_pretrained(\n",
" os.path.abspath(MODEL_PATH),\n",
" local_files_only=True,\n",
" trust_remote_code=False\n",
")\n",
"\n",
"print(f\"✓ Model loaded: {sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters\")"
]
},
{
"cell_type": "markdown",
"id": "test-original-section",
"metadata": {},
"source": [
"### 4. Test original model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "test-original",
"metadata": {},
"outputs": [],
"source": [
"test_prompt = \"The capital of France is\"\n",
"inputs = tokenizer(test_prompt, return_tensors=\"pt\")\n",
"outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)\n",
"original_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
"\n",
"print(f\"Original output: '{original_text}'\")"
]
},
{
"cell_type": "markdown",
"id": "quantize-section",
"metadata": {},
"source": [
"### 5. Apply quantization\n",
"\n",
"Configure the quantization recipe and apply it to the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "quantize",
"metadata": {},
"outputs": [],
"source": [
"# Configure quantization recipe\n",
"recipe = QuantizationModifier(\n",
" targets=\"Linear\", # Apply to all linear layers\n",
" scheme=\"W8A16\", # INT8 weights, FP16 activations\n",
" ignore=[] # Quantize all layers (including lm_head)\n",
")\n",
"\n",
"# Apply one-shot quantization\n",
"oneshot(model=model, recipe=recipe)\n",
"print(\"✓ Quantization complete\")"
]
},
{
"cell_type": "markdown",
"id": "test-quantized-section",
"metadata": {},
"source": [
"### 6. Test quantized model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "test-quantized",
"metadata": {},
"outputs": [],
"source": [
"device_map = infer_auto_device_map(model)\n",
"model = dispatch_model(model, device_map)\n",
"\n",
"inputs = tokenizer(test_prompt, return_tensors=\"pt\")\n",
"input_ids = inputs.input_ids.to(model.device)\n",
"outputs = model.generate(input_ids, max_new_tokens=20, do_sample=False)\n",
"quantized_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
"\n",
"print(f\"Quantized output: '{quantized_text}'\")"
]
},
{
"cell_type": "markdown",
"id": "save-section",
"metadata": {},
"source": [
"### 7. Save compressed model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "save-model",
"metadata": {},
"outputs": [],
"source": [
"Path(SAVE_DIR).mkdir(parents=True, exist_ok=True)\n",
"model.save_pretrained(SAVE_DIR)\n",
"tokenizer.save_pretrained(SAVE_DIR)\n",
"\n",
"print(f\"✓ Model saved to {SAVE_DIR}/\")"
]
},
{
"cell_type": "markdown",
"id": "compare-section",
"metadata": {},
"source": [
"### 8. Compare file sizes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "compare-sizes",
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"\n",
"def get_size_mb(path):\n",
" result = subprocess.run([\"du\", \"-sm\", path], capture_output=True, text=True)\n",
" return int(result.stdout.split()[0])\n",
"\n",
"original_size = get_size_mb(MODEL_PATH)\n",
"compressed_size = get_size_mb(SAVE_DIR)\n",
"reduction = ((original_size - compressed_size) / original_size) * 100\n",
"\n",
"print(\"=\"*50)\n",
"print(f\"Original: {original_size:>6} MB\")\n",
"print(f\"Compressed: {compressed_size:>6} MB\")\n",
"print(f\"Reduction: {reduction:>6.1f}%\")\n",
"print(\"=\"*50)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}