A Coding Guide to LLM Post-Training with TRL: From Supervised Fine-Tuning to DPO and GRPO Reasoning


# Install the post-training stack: TRL for the trainers, PEFT for LoRA,
# plus datasets/accelerate/bitsandbytes. -q -U keeps Colab output quiet.
import subprocess, sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-U",
    "torchao>=0.16",
    "trl>=0.20",
    "transformers>=4.45",
    "datasets",
    "peft>=0.13",
    "accelerate",
    "bitsandbytes",
])


# Colab may have an older torchao already imported; purge any stale
# torchao/peft modules so the freshly installed wheels are picked up.
import sys as _sys
for _m in [m for m in list(_sys.modules) if m.startswith(("torchao", "peft"))]:
    _sys.modules.pop(_m, None)
try:
    import torchao
except Exception:
    # Fall back to a stub module so downstream imports that probe for
    # torchao don't crash when it is unavailable.
    import types
    _fake = types.ModuleType("torchao")
    _fake.__version__ = "0.16.1"
    _sys.modules["torchao"] = _fake


# Silence tokenizer fork warnings, disable W&B logging, and hide HF progress bars.
import os, re, gc, torch, warnings
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"


from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig


print(f"torch={torch.__version__}  cuda={torch.cuda.is_available()}")
if torch.cuda.is_available():
   print(f"GPU: {torch.cuda.get_device_name(0)}  "
         f"({torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB)")


MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"   # small instruct model that fits a Colab T4
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
BF16_OK    = torch.cuda.is_available() and torch.cuda.is_bf16_supported()


# Shared LoRA recipe: rank-8 adapters on the attention projections,
# reused across the training stages.
LORA_CFG = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05, bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
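To see how this adapter config plugs in, here is a minimal, illustrative sketch of wiring LORA_CFG into TRL's SFTTrainer. The toy "text" dataset and the SFTConfig values below are assumptions for demonstration, not part of the original listing:

# Minimal SFT sketch (illustrative): pass LORA_CFG to TRL's SFTTrainer.
# The toy dataset and the SFTConfig values are assumptions for demo purposes.
from trl import SFTConfig, SFTTrainer

toy_train = Dataset.from_dict({
    "text": [
        "### Question: What is 2 + 2?\n### Answer: 4",
        "### Question: Name a prime number.\n### Answer: 7",
    ]
})

sft_args = SFTConfig(
    output_dir="sft-demo",
    per_device_train_batch_size=1,
    max_steps=5,               # keep the demo cheap on a Colab T4
    bf16=BF16_OK,
    report_to="none",
)

trainer = SFTTrainer(
    model=MODEL_NAME,          # SFTTrainer can load the model from its name
    args=sft_args,
    train_dataset=toy_train,
    peft_config=LORA_CFG,      # apply the rank-8 LoRA adapters defined above
)
# trainer.train()  # uncomment to run the short demo fine-tune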


def cleanup():
    """Release VRAM between training stages (Colab T4 is tight)."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def chat_generate(model, tokenizer, prompt, max_new_tokens=120):
    """Helper: format as chat, generate, decode just the assistant turn."""
    msgs = [{"role": "user", "content": prompt}]
    ids = tokenizer.apply_chat_template(
        msgs, return_tensors="pt", add_generation_prompt=True
    ).to(model.device)
    with torch.no_grad():
        out = model.generate(
            ids, max_new_tokens=max_new_tokens,
            do_sample=True, temperature=0.7, top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(out[0][ids.shape[-1]:], skip_special_tokens=True)
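As a quick sanity check, the helper can be exercised on the base model before any fine-tuning. A minimal sketch follows; the dtype and device choices here are assumptions based on the BF16_OK and DEVICE flags defined earlier:

# Sanity-check sketch (assumptions: dtype/device handling as shown).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16 if BF16_OK else torch.float32,
).to(DEVICE)

print(chat_generate(model, tokenizer, "Explain LoRA in one sentence."))
cleanup()  # free VRAM before the next training stage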