• bitcoinBitcoin(BTC)$77,542.001.02%
  • ethereumEthereum(ETH)$2,131.330.80%
  • tetherTether(USDT)$1.00-0.02%
  • binancecoinBNB(BNB)$643.920.66%
  • rippleXRP(XRP)$1.37-0.24%
  • usd-coinUSDC(USDC)$1.000.00%
  • solanaSolana(SOL)$84.940.45%
  • tronTRON(TRX)$0.3574690.70%
  • Figure HelocFigure Heloc(FIGR_HELOC)$1.03-0.32%
  • dogecoinDogecoin(DOGE)$0.103802-0.21%
  • whitebitWhiteBIT Coin(WBT)$57.201.12%
  • HyperliquidHyperliquid(HYPE)$49.292.71%
  • USDSUSDS(USDS)$1.000.01%
  • zcashZcash(ZEC)$587.104.67%
  • cardanoCardano(ADA)$0.249609-0.06%
  • leo-tokenLEO Token(LEO)$10.01-0.65%
  • bitcoin-cashBitcoin Cash(BCH)$369.03-2.08%
  • moneroMonero(XMR)$396.392.43%
  • chainlinkChainlink(LINK)$9.58-0.01%
  • CantonCanton(CC)$0.149290-0.52%
  • the-open-networkToncoin(TON)$1.94-2.78%
  • stellarStellar(XLM)$0.142739-1.95%
  • USD1USD1(USD1)$1.000.04%
  • Ethena USDeEthena USDe(USDE)$1.000.00%
  • MemeCoreMemeCore(M)$3.35-3.56%
  • daiDai(DAI)$1.000.02%
  • suiSui(SUI)$1.06-0.80%
  • litecoinLitecoin(LTC)$54.11-0.05%
  • avalanche-2Avalanche(AVAX)$9.210.46%
  • hedera-hashgraphHedera(HBAR)$0.089092-0.71%
  • RainRain(RAIN)$0.0074861.84%
  • paypal-usdPayPal USD(PYUSD)$1.000.00%
  • shiba-inuShiba Inu(SHIB)$0.0000060.12%
  • crypto-com-chainCronos(CRO)$0.068774-0.70%
  • Circle USYCCircle USYC(USYC)$1.120.00%
  • Global DollarGlobal Dollar(USDG)$1.000.01%
  • tether-goldTether Gold(XAUT)$4,487.70-1.09%
  • BlackRock USD Institutional Digital Liquidity FundBlackRock USD Institutional Digital Liquidity Fund(BUIDL)$1.000.00%
  • BittensorBittensor(TAO)$262.441.38%
  • uniswapUniswap(UNI)$3.654.35%
  • Ondo US Dollar YieldOndo US Dollar Yield(USDY)$1.13-0.13%
  • nearNEAR Protocol(NEAR)$1.641.32%
  • pax-goldPAX Gold(PAXG)$4,489.26-1.03%
  • polkadotPolkadot(DOT)$1.240.37%
  • mantleMantle(MNT)$0.63-0.16%
  • World Liberty FinancialWorld Liberty Financial(WLFI)$0.0618013.18%
  • OndoOndo(ONDO)$0.375225-0.19%
  • HTX DAOHTX DAO(HTX)$0.0000020.48%
  • Falcon USDFalcon USD(USDF)$1.000.01%
  • AsterAster(ASTER)$0.661.22%
TradePoint.io
  • Main
  • AI & Technology
  • Stock Charts
  • Market & News
  • Business
  • Finance Tips
  • Trade Tube
  • Blog
  • Shop
No Result
View All Result
TradePoint.io
No Result
View All Result

A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor

May 17, 2026
in AI & Technology
Reading Time: 2 mins read
A A
A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor
ShareShareShareShareShare

YOU MAY ALSO LIKE

Alibaba Qwen Team Introduces Qwen3.5-LiveTranslate-Flash: Real-Time Multimodal Interpretation Across 60 Languages at 2.8-Second Latency

SAP Unveils Automation Suite Amid Software Market Doubts

import subprocess, sys
def pip(*pkgs):
    """Install packages into the environment of the running interpreter.

    Runs ``<sys.executable> -m pip install -q <pkgs...>`` so the packages are
    installed for the same Python that executes this script.

    Args:
        *pkgs: Requirement specifiers handed to pip unchanged
            (for example ``"transformers>=4.45"``).

    Raises:
        subprocess.CalledProcessError: If the pip process exits non-zero.
    """
    if not pkgs:
        # pip rejects an install with no requirements; an empty request is
        # treated as "nothing to do" instead of surfacing that as an error.
        return
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])
# Install the packages this walkthrough depends on before the imports that
# follow this call.
pip(
    "llmcompressor",
    "compressed-tensors",
    "transformers>=4.45",
    "accelerate",
    "datasets",
)
import os, gc, time, json, math
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# Everything below runs on CUDA, so stop early with an actionable hint when no
# GPU is visible.
assert torch.cuda.is_available(), \
    "Enable a GPU: Runtime > Change runtime type > T4 GPU"
print("GPU:", torch.cuda.get_device_name(0),
      "| CUDA:", torch.version.cuda,
      "| torch:", torch.__version__)

# Hugging Face model id of the checkpoint used throughout the script.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Working directory for artifacts; the process changes into it so relative
# paths resolve there. NOTE(review): "/content" is presumably the Colab home;
# parents=True keeps the script usable where that directory does not exist.
WORKDIR = Path("/content/quant_lab")
WORKDIR.mkdir(parents=True, exist_ok=True)
os.chdir(WORKDIR)
def free_mem():
    """Run a garbage-collection pass, then ask PyTorch to empty its CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()
def dir_size_gb(path):
    """Return the combined size of every file under *path*, in units of 1e9 bytes.

    The tree is traversed with ``os.walk`` and each file is measured with
    ``os.path.getsize``. Entries whose size cannot be read (for example a
    dangling symlink, or a file removed while walking) are skipped so one bad
    entry does not abort the whole measurement.

    Args:
        path: Root directory to measure. A path that ``os.walk`` cannot
            traverse yields ``0.0``.

    Returns:
        float: Total bytes divided by ``1e9``.
    """
    total = 0
    for root, _, files in os.walk(path):
        for name in files:
            try:
                total += os.path.getsize(os.path.join(root, name))
            except OSError:
                # Unreadable entry: leave it out of the total.
                continue
    return total / 1e9
def time_generation(model, tok, prompt, max_new_tokens=64):
    """Greedy-decode *prompt* once and time it after a brief warmup.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        tok: Tokenizer; called on the prompt with ``return_tensors="pt"`` and
            used to decode the continuation.
        prompt: Text to condition on.
        max_new_tokens: Upper bound on generated tokens for the timed run.

    Returns:
        tuple: ``(text, seconds, tokens_per_second)`` where ``text`` is the
        decoded continuation without special tokens, ``seconds`` is the
        wall-clock time of the timed ``generate`` call, and
        ``tokens_per_second`` is based on the number of tokens actually
        produced (generation may stop before ``max_new_tokens``).
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    gen_kwargs = {"do_sample": False, "pad_token_id": tok.eos_token_id}

    # Short untimed run so one-time startup cost is not charged to the
    # measurement below.
    model.generate(**inputs, max_new_tokens=4, **gen_kwargs)
    torch.cuda.synchronize()

    # Synchronize on both sides so queued GPU work is inside the timed window.
    t0 = time.perf_counter()
    out = model.generate(**inputs, max_new_tokens=max_new_tokens, **gen_kwargs)
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    # Drop the prompt tokens; what remains is the newly generated continuation.
    prompt_len = inputs["input_ids"].shape[1]
    new_ids = out[0][prompt_len:]
    # Use the real continuation length: dividing max_new_tokens by dt would
    # overstate throughput whenever generation ends early.
    tok_per_s = len(new_ids) / dt
    return tok.decode(new_ids, skip_special_tokens=True), dt, tok_per_s
@torch.no_grad()
def wikitext_ppl(model, tok, seq_len=512, max_chunks=20, stride=512):
    """Light WikiText-2 perplexity probe (fast, indicative).

    Joins the non-blank entries among the first 400 rows of the WikiText-2 raw
    test split, tokenizes them as one sequence, and scores fixed-size windows
    with the model's own language-modeling loss.

    Args:
        model: Causal LM called as ``model(chunk, labels=chunk)``; must expose
            ``.device`` and return an object with a ``.loss`` tensor.
        tok: Tokenizer producing ``input_ids`` with ``return_tensors="pt"``.
        seq_len: Window length in tokens (at least 2).
        max_chunks: Stop after this many windows; at least one window is
            always scored.
        stride: Step between window starts.

    Returns:
        float: ``exp`` of the token-weighted mean loss over the scored windows.

    Raises:
        ValueError: If ``seq_len`` is below 2 or the text tokenizes to fewer
            than two tokens, since no next-token prediction can be scored.
    """
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    text = "\n\n".join(t for t in ds["text"][:400] if t.strip())
    enc = tok(text, return_tensors="pt").input_ids.to(model.device)

    n_tokens = enc.size(1)
    if seq_len < 2 or n_tokens < 2:
        raise ValueError(
            "wikitext_ppl needs seq_len >= 2 and at least two tokens of text "
            f"(got seq_len={seq_len}, tokens={n_tokens})")

    # Last start offset that still yields a full window; clamps to 0 when the
    # text is shorter than seq_len so a single (shorter) window is scored
    # instead of dividing by zero.
    last_begin = max(n_tokens - seq_len, 0)

    nll_sum, scored_tokens, chunks_done = 0.0, 0, 0
    for begin in range(0, last_begin + 1, stride):
        chunk = enc[:, begin:begin + seq_len]
        # NOTE(review): Hugging Face causal-LM heads shift labels internally,
        # so the mean loss covers len(chunk) - 1 predictions; confirm for
        # non-HF models.
        n_pred = chunk.size(1) - 1
        out = model(chunk, labels=chunk)
        nll_sum += out.loss.float().item() * n_pred
        scored_tokens += n_pred
        chunks_done += 1
        if chunks_done >= max_chunks:
            break
    return math.exp(nll_sum / scored_tokens)
# Benchmark results accumulated by label; each value is one metrics dict.
results = dict()

# Fixed single-turn prompt with the chat turn markers written out by hand
# (presumably the format MODEL_ID's chat template produces; verify if the
# model changes).
PROMPT = "".join([
    "<|im_start|>user\n",
    "In two sentences, explain why post-training quantization works ",
    "for large language models.",
    "<|im_end|>\n",
    "<|im_start|>assistant\n",
])
def benchmark(label, model_path_or_id):
    """Load a model, measure it, and record the metrics under *label*.

    Loads the tokenizer and model from ``model_path_or_id`` onto CUDA, times a
    greedy generation of ``PROMPT``, runs the WikiText perplexity probe, and
    stores the outcome in the module-level ``results`` dict (also printed as
    JSON).

    Args:
        label: Key under which the metrics are stored in ``results``.
        model_path_or_id: Local directory or hub identifier accepted by
            ``from_pretrained``. On-disk size is reported only when this is
            an existing directory; otherwise ``size_gb`` is ``None``.
    """
    free_mem()
    print(f"\n──── benchmarking: {label} ────")
    tok = AutoTokenizer.from_pretrained(model_path_or_id)
    m = AutoModelForCausalLM.from_pretrained(
        model_path_or_id, torch_dtype="auto", device_map="cuda").eval()
    try:
        sample, dt, tps = time_generation(m, tok, PROMPT)
        ppl = wikitext_ppl(m, tok)
    finally:
        # Drop this function's reference to the model and run cleanup even
        # when a measurement raises, so a failed run does not skip it.
        del m
        free_mem()

    is_local_dir = os.path.isdir(str(model_path_or_id))
    size = dir_size_gb(model_path_or_id) if is_local_dir else None
    results[label] = {
        "size_gb": size,
        "ppl": round(ppl, 3),
        "latency_s": round(dt, 3),
        "tok_per_s": round(tps, 1),
        # Single-line, truncated preview of the generated answer.
        "sample": sample.strip().replace("\n", " ")[:180],
    }
    print(json.dumps(results[label], indent=2))

Credit: Source link

ShareTweetSendSharePin

Related Posts

Alibaba Qwen Team Introduces Qwen3.5-LiveTranslate-Flash: Real-Time Multimodal Interpretation Across 60 Languages at 2.8-Second Latency
AI & Technology

Alibaba Qwen Team Introduces Qwen3.5-LiveTranslate-Flash: Real-Time Multimodal Interpretation Across 60 Languages at 2.8-Second Latency

May 20, 2026
SAP Unveils Automation Suite Amid Software Market Doubts
AI & Technology

SAP Unveils Automation Suite Amid Software Market Doubts

May 20, 2026
A-Star: Small Bets Still Crucial for VC-Style Returns
AI & Technology

A-Star: Small Bets Still Crucial for VC-Style Returns

May 20, 2026
CME Plans Computing Power Futures Market
AI & Technology

CME Plans Computing Power Futures Market

May 20, 2026
Next Post
Meta to lay off 10% of company as Microsoft offers buyouts

Meta to lay off 10% of company as Microsoft offers buyouts

Leave a Reply Cancel reply

Your email address will not be published. Required fields are marked *

Search

No Result
View All Result
Unfounded conspiracies swirl online after shooting at White House Correspondents’ Dinner

Unfounded conspiracies swirl online after shooting at White House Correspondents’ Dinner

May 13, 2026
Kalshi suspends three politicians from platform

Kalshi suspends three politicians from platform

May 19, 2026
AI-powered bank founded by Peter Thiel protégé wants to replace humans with code — will it work?

AI-powered bank founded by Peter Thiel protégé wants to replace humans with code — will it work?

May 14, 2026

About

Learn more

Our Services

Legal

Privacy Policy

Terms of Use

Bloggers

Learn more

Article Links

Contact

Advertise

Ask us anything

©2020- TradePoint.io - All rights reserved!

Tradepoint.io, being just a publishing and technology platform, is not a registered broker-dealer or investment adviser, so we do not provide investment advice. Rather, brokerage services are provided to clients of Tradepoint.io by independent SEC-registered broker-dealers and members of FINRA/SIPC. Every form of investing carries some risk, and past performance is not a guarantee of future results. “Tradepoint.io”, “Instant Investing” and “My Trading Tools” are registered trademarks of Apperbuild, LLC.

This website is operated by Apperbuild, LLC. We have no link to any brokerage firm and we do not provide investment advice. All information and resources we provide are solely for the education of our readers. © 2020 Apperbuild, LLC. All rights reserved.

No Result
View All Result
  • Main
  • AI & Technology
  • Stock Charts
  • Market & News
  • Business
  • Finance Tips
  • Trade Tube
  • Blog
  • Shop

© 2023 - TradePoint.io - All Rights Reserved!