LoRA Fine-Tuning Starter
Fine-tune a Hugging Face causal language model with LoRA on minimal VRAM. Complete scripts for dataset preparation, training, and inference.
Description
Train your own custom model without needing 80GB GPUs. LoRA (Low-Rank Adaptation) freezes the pretrained weights and trains only small low-rank adapter matrices, which cuts the number of trainable parameters, VRAM use, and checkpoint size dramatically.
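To make "small adapter weights" concrete: instead of updating a full weight matrix W, LoRA learns two small matrices A and B and adds their product (scaled by alpha/r) alongside the frozen layer. Below is a toy, self-contained PyTorch sketch of the idea, for illustration only; the actual implementation used later comes from the peft library.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Toy LoRA layer: keep the pretrained weight frozen, learn a low-rank update B @ A."""

    def __init__(self, base: nn.Linear, r: int = 16, alpha: int = 32):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)          # freeze the pretrained weights
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: adapter starts as a no-op
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + (x @ self.A.T @ self.B.T) * self.scale

layer = LoRALinear(nn.Linear(4096, 4096), r=16)
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
total = sum(p.numel() for p in layer.parameters())
print(f"trainable: {trainable:,} of {total:,} params")  # ~131k of ~16.9M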
Requirements
pip install torch transformers peft datasets accelerate bitsandbytes trl
Hardware: 8GB+ VRAM GPU (RTX 3060 or better), or use Google Colab.
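Optional sanity check that PyTorch can see your GPU and roughly how much VRAM it has:
import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name} with {props.total_memory / 1e9:.1f} GB VRAM")
else:
    print("No CUDA GPU detected; training on CPU will be extremely slow.")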
train_lora.py
#!/usr/bin/env python3
"""
LoRA Fine-Tuning Script
Usage: python train_lora.py --model meta-llama/Llama-3.2-1B --dataset data.jsonl
"""
import argparse
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
# ════════════════════════════════════════════════════════════════════════════
# Configuration
# ════════════════════════════════════════════════════════════════════════════
DEFAULT_MODEL = "meta-llama/Llama-3.2-1B"
OUTPUT_DIR = "./lora-output"
# LoRA hyperparameters
LORA_CONFIG = {
"r": 16, # Rank - higher = more capacity, more VRAM
"lora_alpha": 32, # Scaling factor
"lora_dropout": 0.05, # Dropout for regularization
"bias": "none",
"task_type": "CAUSAL_LM",
"target_modules": [ # Which layers to adapt
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"
],
}
# Training hyperparameters
TRAINING_CONFIG = {
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 4,
"num_train_epochs": 3,
"learning_rate": 2e-4,
"warmup_ratio": 0.03,
"lr_scheduler_type": "cosine",
"logging_steps": 10,
"save_strategy": "epoch",
"fp16": True, # Use bf16=True for Ampere+ GPUs
"optim": "paged_adamw_8bit",
}
# ════════════════════════════════════════════════════════════════════════════
# Model Loading
# ════════════════════════════════════════════════════════════════════════════
def load_model_and_tokenizer(model_name: str, use_4bit: bool = True):
    """Load model with optional 4-bit quantization."""
    # Quantization config for low VRAM
    bnb_config = None
    if use_4bit:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    if use_4bit:
        model = prepare_model_for_kbit_training(model)
    return model, tokenizer
# ════════════════════════════════════════════════════════════════════════════
# Dataset Preparation
# ════════════════════════════════════════════════════════════════════════════
def format_instruction(sample: dict) -> str:
    """Format a single sample into instruction format."""
    # Adjust based on your data structure
    if "instruction" in sample and "response" in sample:
        return (
            f"### Instruction:\n{sample['instruction']}\n"
            f"### Response:\n{sample['response']}"
        )
    elif "prompt" in sample and "completion" in sample:
        return f"{sample['prompt']}{sample['completion']}"
    elif "text" in sample:
        return sample["text"]
    else:
        raise ValueError(f"Unknown data format: {sample.keys()}")


def load_and_format_dataset(data_path: str):
    """Load the training dataset (per-sample formatting is applied later via format_instruction)."""
    if data_path.endswith((".jsonl", ".json")):
        dataset = load_dataset("json", data_files=data_path, split="train")
    elif data_path.endswith(".csv"):
        dataset = load_dataset("csv", data_files=data_path, split="train")
    else:
        # Assume it's a dataset name on the Hugging Face Hub
        dataset = load_dataset(data_path, split="train")
    return dataset
# ════════════════════════════════════════════════════════════════════════════
# Training
# ════════════════════════════════════════════════════════════════════════════
def train(model_name: str, data_path: str, output_dir: str = OUTPUT_DIR):
    """Run the fine-tuning process."""
    print(f"🚀 Loading model: {model_name}")
    model, tokenizer = load_model_and_tokenizer(model_name)

    print(f"📊 Loading dataset: {data_path}")
    dataset = load_and_format_dataset(data_path)
    print(f"   Found {len(dataset)} samples")

    # Apply LoRA
    print("🔧 Applying LoRA configuration...")
    lora_config = LoraConfig(**LORA_CONFIG)
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        **TRAINING_CONFIG,
    )

    # Trainer
    # Note: these arguments match older TRL releases. Newer TRL moves
    # max_seq_length and packing into SFTConfig, renames tokenizer to
    # processing_class, and may expect formatting_func to return a list of
    # strings per batch; check the docs for your installed version.
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        tokenizer=tokenizer,
        args=training_args,
        formatting_func=format_instruction,
        max_seq_length=512,
        packing=False,
    )

    # Train!
    print("🏋️ Starting training...")
    trainer.train()

    # Save the adapter and tokenizer
    print(f"💾 Saving to {output_dir}")
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)
    print("✅ Done!")
# ════════════════════════════════════════════════════════════════════════════
# Main
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fine-tune a model with LoRA")
    parser.add_argument("--model", default=DEFAULT_MODEL, help="Base model name")
    parser.add_argument("--dataset", required=True, help="Training data path")
    parser.add_argument("--output", default=OUTPUT_DIR, help="Output directory")
    args = parser.parse_args()

    train(args.model, args.dataset, args.output)
inference.py - Use Your Model
#!/usr/bin/env python3
"""
Inference with fine-tuned LoRA model
Usage: python inference.py --adapter ./lora-output "Your prompt"
"""
import argparse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
def load_lora_model(base_model: str, adapter_path: str):
    """Load base model with LoRA adapter."""
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload()  # Optional: merge adapter weights for faster inference
    return model, tokenizer
def generate(model, tokenizer, prompt: str, max_tokens: int = 256):
    """Generate text with the model."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("prompt", help="Input prompt")
    parser.add_argument("--base-model", default="meta-llama/Llama-3.2-1B")
    parser.add_argument("--adapter", default="./lora-output")
    args = parser.parse_args()

    print("Loading model...")
    model, tokenizer = load_lora_model(args.base_model, args.adapter)
    print("Generating...")
    output = generate(model, tokenizer, args.prompt)
    print(output)
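Because merge_and_unload() folds the adapter into the base weights, you can also export a standalone checkpoint that no longer needs peft at serving time. A small optional sketch (the output path is just an example):
# Optional: after merge_and_unload(), save a standalone merged checkpoint
merged_dir = "./merged-model"   # example output path
model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)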
Dataset Format
Your training data should be in JSONL format (one JSON object per line), for example instruction/response pairs:
{"instruction": "Write a haiku about programming", "response": "Code flows like water\nBugs emerge from the shadows\nCoffee saves the day"}
{"instruction": "Explain recursion", "response": "To understand recursion, you must first understand recursion. Just kidding - it's when a function calls itself to solve smaller subproblems."}
Or simple prompt/completion:
{"prompt": "Q: What is the capital of France?\nA:", "completion": " Paris"}
{"prompt": "Q: What is 2+2?\nA:", "completion": " 4"}
Usage
# 1. Prepare your data
cat > data.jsonl << 'EOF'
{"instruction": "Say hello", "response": "Hello! How can I help you today?"}
{"instruction": "Tell a joke", "response": "Why do programmers prefer dark mode? Because light attracts bugs!"}
EOF
# 2. Train (takes 10-60 min depending on GPU and data size)
python train_lora.py --model meta-llama/Llama-3.2-1B --dataset data.jsonl
# 3. Inference
python inference.py --adapter ./lora-output "Tell me a joke"
Tips
- Start small: Use a 1B-3B model first; scale up once it works
- Quality > Quantity: 1,000 high-quality samples beat 100k noisy ones
- Learning rate: Use a lower rate (around 1e-5) for larger models and a higher one (around 2e-4) for small ones
- Rank (r): 8-16 is usually enough; increase it if the model underfits
- Epochs: 1-3 epochs; more can cause overfitting
- Gradient checkpointing: Enable it if you run out of VRAM (see the snippet below)
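For the gradient-checkpointing tip, these are the knobs you would add to TRAINING_CONFIG in train_lora.py. Flag names can vary slightly between transformers versions, so treat this as a sketch:
TRAINING_CONFIG.update({
    "gradient_checkpointing": True,                      # recompute activations instead of storing them
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "per_device_train_batch_size": 1,                    # smaller batches...
    "gradient_accumulation_steps": 16,                   # ...same effective batch size
})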