• bitcoinBitcoin(BTC)$77,421.000.61%
  • ethereumEthereum(ETH)$2,128.340.54%
  • tetherTether(USDT)$1.00-0.02%
  • binancecoinBNB(BNB)$643.130.48%
  • rippleXRP(XRP)$1.37-0.37%
  • usd-coinUSDC(USDC)$1.000.00%
  • solanaSolana(SOL)$84.920.28%
  • tronTRON(TRX)$0.3578140.78%
  • Figure HelocFigure Heloc(FIGR_HELOC)$1.03-0.32%
  • dogecoinDogecoin(DOGE)$0.103833-0.34%
  • whitebitWhiteBIT Coin(WBT)$57.150.76%
  • HyperliquidHyperliquid(HYPE)$49.812.61%
  • USDSUSDS(USDS)$1.000.00%
  • zcashZcash(ZEC)$582.854.39%
  • cardanoCardano(ADA)$0.2497480.06%
  • leo-tokenLEO Token(LEO)$10.01-0.72%
  • bitcoin-cashBitcoin Cash(BCH)$368.11-1.51%
  • moneroMonero(XMR)$398.451.86%
  • chainlinkChainlink(LINK)$9.59-0.14%
  • CantonCanton(CC)$0.149253-0.66%
  • the-open-networkToncoin(TON)$1.94-2.63%
  • stellarStellar(XLM)$0.143057-1.69%
  • USD1USD1(USD1)$1.000.02%
  • Ethena USDeEthena USDe(USDE)$1.00-0.01%
  • daiDai(DAI)$1.00-0.02%
  • MemeCoreMemeCore(M)$3.32-4.19%
  • suiSui(SUI)$1.06-0.38%
  • litecoinLitecoin(LTC)$53.94-0.50%
  • avalanche-2Avalanche(AVAX)$9.220.85%
  • hedera-hashgraphHedera(HBAR)$0.088964-0.71%
  • RainRain(RAIN)$0.0075072.14%
  • paypal-usdPayPal USD(PYUSD)$1.000.06%
  • shiba-inuShiba Inu(SHIB)$0.0000060.27%
  • crypto-com-chainCronos(CRO)$0.068662-0.78%
  • Circle USYCCircle USYC(USYC)$1.120.00%
  • Global DollarGlobal Dollar(USDG)$1.00-0.01%
  • tether-goldTether Gold(XAUT)$4,491.54-1.00%
  • BlackRock USD Institutional Digital Liquidity FundBlackRock USD Institutional Digital Liquidity Fund(BUIDL)$1.000.00%
  • BittensorBittensor(TAO)$263.451.61%
  • uniswapUniswap(UNI)$3.603.14%
  • nearNEAR Protocol(NEAR)$1.652.37%
  • Ondo US Dollar YieldOndo US Dollar Yield(USDY)$1.130.04%
  • pax-goldPAX Gold(PAXG)$4,493.16-0.96%
  • polkadotPolkadot(DOT)$1.240.39%
  • mantleMantle(MNT)$0.63-0.22%
  • World Liberty FinancialWorld Liberty Financial(WLFI)$0.0614942.61%
  • OndoOndo(ONDO)$0.3818682.08%
  • HTX DAOHTX DAO(HTX)$0.0000020.42%
  • Falcon USDFalcon USD(USDF)$1.000.00%
  • AsterAster(ASTER)$0.661.22%
TradePoint.io
  • Main
  • AI & Technology
  • Stock Charts
  • Market & News
  • Business
  • Finance Tips
  • Trade Tube
  • Blog
  • Shop
No Result
View All Result
TradePoint.io
No Result
View All Result

A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor

May 17, 2026
in AI & Technology
Reading Time: 2 mins read
A A
A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor
ShareShareShareShareShare

YOU MAY ALSO LIKE

NVIDIA AI Releases Nemotron-Labs-Diffusion: A Tri-Mode Language Model with 6× Tokens Per Forward Over Qwen3-8B

Mercedes-AMG’s 1,153 Hp Electric GT 4-Door Takes On Porsche’s Taycan

import subprocess, sys
def pip(*pkgs):
    """Quietly install ``pkgs`` using the pip of the running interpreter.

    Executes ``<sys.executable> -m pip install -q <pkgs...>`` through
    ``subprocess.check_call``, so a failing pip invocation raises
    ``subprocess.CalledProcessError``.
    """
    command = [sys.executable, "-m", "pip", "install", "-q"]
    command.extend(pkgs)
    subprocess.check_call(command)


# Install the quantization libraries and the Hugging Face stack before they
# are imported further down.
pip(
    "llmcompressor",
    "compressed-tensors",
    "transformers>=4.45",
    "accelerate",
    "datasets",
)
import os, gc, time, json, math
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# Stop immediately when no CUDA device is visible: the helpers below call
# torch.cuda directly and load models with device_map="cuda".
assert torch.cuda.is_available(), \
    "Enable a GPU: Runtime > Change runtime type > T4 GPU"
print("GPU:", torch.cuda.get_device_name(0),
      "| CUDA:", torch.version.cuda,
      "| torch:", torch.__version__)

# Model identifier for the walkthrough (not referenced in this excerpt;
# presumably consumed by later steps).
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Scratch directory that becomes the process working directory.
# parents=True also creates missing ancestors, so setup does not fail with
# FileNotFoundError on machines where /content does not exist yet.
WORKDIR = Path("/content/quant_lab")
WORKDIR.mkdir(parents=True, exist_ok=True)
os.chdir(WORKDIR)
def free_mem():
    """Run Python garbage collection, then empty PyTorch's CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()
def dir_size_gb(path):
    """Return the combined size of every file below ``path`` in gigabytes.

    The tree is traversed with ``os.walk``; the byte counts reported by
    ``os.path.getsize`` are summed and divided by 1e9 (decimal gigabytes).
    """
    byte_total = sum(
        os.path.getsize(os.path.join(folder, filename))
        for folder, _subdirs, filenames in os.walk(path)
        for filename in filenames
    )
    return byte_total / 1e9
def time_generation(model, tok, prompt, max_new_tokens=64):
    """Greedy decode; reports latency & tokens/sec after a brief warmup.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tok: Tokenizer used to encode ``prompt`` and decode the output.
        prompt: Text to continue.
        max_new_tokens: Upper bound on tokens produced in the timed run.

    Returns:
        A ``(text, seconds, tokens_per_second)`` tuple. ``text`` is the
        decoded continuation with special tokens removed; the rate is based
        on the number of tokens actually generated.
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    # Short untimed run so one-off startup costs stay out of the measurement.
    _ = model.generate(**inputs, max_new_tokens=4, do_sample=False,
                       pad_token_id=tok.eos_token_id)
    # CUDA work is asynchronous: synchronize on both sides of the timed
    # region so the clock covers the whole generation.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    out = model.generate(**inputs, max_new_tokens=max_new_tokens,
                         do_sample=False, pad_token_id=tok.eos_token_id)
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    # Keep only the continuation: drop the prompt tokens echoed at the front.
    new_ids = out[0][inputs["input_ids"].shape[1]:]
    # Generation can stop before max_new_tokens (e.g. at EOS), so throughput
    # must use the real token count rather than the requested budget.
    generated = new_ids.shape[0]
    return tok.decode(new_ids, skip_special_tokens=True), dt, generated / dt
@torch.no_grad()
def wikitext_ppl(model, tok, seq_len=512, max_chunks=20, stride=512):
    """Light WikiText-2 perplexity probe (fast, indicative).

    Joins the non-blank entries among the first 400 rows of the WikiText-2
    raw test split, tokenizes the result once, and scores fixed-length
    windows of ``seq_len`` tokens taken every ``stride`` tokens.

    Args:
        model: Causal language model that returns ``.loss`` when called
            with ``labels``.
        tok: Tokenizer matching ``model``.
        seq_len: Number of tokens per scored window.
        max_chunks: Scoring stops once this many windows have been scored.
        stride: Offset between the starts of consecutive windows.

    Returns:
        ``exp`` of the average per-window loss, as a float.

    Raises:
        ValueError: If the tokenized text is too short to yield a single
            full window.
    """
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    text = "\n\n".join(t for t in ds["text"][:400] if t.strip())
    enc = tok(text, return_tensors="pt").input_ids.to(model.device)
    nll_sum, tok_count = 0.0, 0
    # "+ 1" keeps the last full window when it ends exactly at the final
    # token; without it that window (and a text of exactly seq_len tokens)
    # would be skipped.
    for begin in range(0, enc.size(1) - seq_len + 1, stride):
        chunk = enc[:, begin:begin + seq_len]
        out = model(chunk, labels=chunk)
        # out.loss is treated as the window's mean loss; weighting by seq_len
        # keeps the running sum and the running count in the same units.
        nll_sum += out.loss.float().item() * seq_len
        tok_count += seq_len
        if tok_count // seq_len >= max_chunks:
            break
    if tok_count == 0:
        raise ValueError(
            f"tokenized text has {enc.size(1)} tokens; "
            f"need at least seq_len={seq_len} to score one window")
    return math.exp(nll_sum / tok_count)
# Benchmark metrics collected so far, keyed by run label.
results = {}

# Single-turn chat prompt: one user message followed by an opened assistant
# turn for the model to complete.
PROMPT = "".join((
    "<|im_start|>user\n",
    "In two sentences, explain why post-training quantization works "
    "for large language models.",
    "<|im_end|>\n",
    "<|im_start|>assistant\n",
))
def benchmark(label, model_path_or_id):
    """Load a model, measure it, and store the metrics in ``results``.

    Measures generation latency/throughput on ``PROMPT`` and the WikiText-2
    perplexity probe, prints the metrics as JSON, and writes them to
    ``results[label]``.

    Args:
        label: Key under which the metrics are stored in ``results``.
        model_path_or_id: Local checkpoint directory or any identifier
            accepted by ``from_pretrained``. ``size_gb`` is reported only
            when this is an existing directory; otherwise it is ``None``.
    """
    free_mem()
    print(f"\n──── benchmarking: {label} ────")
    tok = AutoTokenizer.from_pretrained(model_path_or_id)
    m = AutoModelForCausalLM.from_pretrained(
        model_path_or_id, torch_dtype="auto", device_map="cuda").eval()
    try:
        sample, dt, tps = time_generation(m, tok, PROMPT)
        ppl = wikitext_ppl(m, tok)
        size = (dir_size_gb(model_path_or_id)
                if os.path.isdir(str(model_path_or_id)) else None)
        results[label] = {"size_gb": size, "ppl": round(ppl, 3),
                          "latency_s": round(dt, 3), "tok_per_s": round(tps, 1),
                          # One-line preview of the generated text, capped at 180 chars.
                          "sample": sample.strip().replace("\n", " ")[:180]}
        print(json.dumps(results[label], indent=2))
    finally:
        # Release the model even when a measurement raises, so a failed run
        # does not keep GPU memory pinned for the next benchmark.
        del m
        free_mem()

Credit: Source link

ShareTweetSendSharePin

Related Posts

NVIDIA AI Releases Nemotron-Labs-Diffusion: A Tri-Mode Language Model with 6× Tokens Per Forward Over Qwen3-8B
AI & Technology

NVIDIA AI Releases Nemotron-Labs-Diffusion: A Tri-Mode Language Model with 6× Tokens Per Forward Over Qwen3-8B

May 20, 2026
Mercedes-AMG’s 1,153 Hp Electric GT 4-Door Takes On Porsche’s Taycan
AI & Technology

Mercedes-AMG’s 1,153 Hp Electric GT 4-Door Takes On Porsche’s Taycan

May 20, 2026
Alibaba Qwen Team Introduces Qwen3.5-LiveTranslate-Flash: Real-Time Multimodal Interpretation Across 60 Languages at 2.8-Second Latency
AI & Technology

Alibaba Qwen Team Introduces Qwen3.5-LiveTranslate-Flash: Real-Time Multimodal Interpretation Across 60 Languages at 2.8-Second Latency

May 20, 2026
SAP Unveils Automation Suite Amid Software Market Doubts
AI & Technology

SAP Unveils Automation Suite Amid Software Market Doubts

May 20, 2026
Next Post
Meta to lay off 10% of company as Microsoft offers buyouts

Meta to lay off 10% of company as Microsoft offers buyouts

Leave a Reply Cancel reply

Your email address will not be published. Required fields are marked *

Search

No Result
View All Result
True crime podcast helps crack cold case

True crime podcast helps crack cold case

May 13, 2026
Finding common ground in America

Finding common ground in America

May 18, 2026
Justice Department drops criminal investigation into Jerome Powell

Justice Department drops criminal investigation into Jerome Powell

May 17, 2026

About

Learn more

Our Services

Legal

Privacy Policy

Terms of Use

Bloggers

Learn more

Article Links

Contact

Advertise

Ask us anything

©2020- TradePoint.io - All rights reserved!

Tradepoint.io, being just a publishing and technology platform, is not a registered broker-dealer or investment adviser, so we do not provide investment advice. Rather, brokerage services are provided to clients of Tradepoint.io by independent SEC-registered broker-dealers and members of FINRA/SIPC. Every form of investing carries some risk and past performance is not a guarantee of future results. “Tradepoint.io”, “Instant Investing” and “My Trading Tools” are registered trademarks of Apperbuild, LLC.

This website is operated by Apperbuild, LLC. We have no link to any brokerage firm and we do not provide investment advice. All information and resources we provide are solely for the education of our readers. © 2020 Apperbuild, LLC. All rights reserved.

No Result
View All Result
  • Main
  • AI & Technology
  • Stock Charts
  • Market & News
  • Business
  • Finance Tips
  • Trade Tube
  • Blog
  • Shop

© 2023 - TradePoint.io - All Rights Reserved!