Monday, 1 June 2026

how to: agar ek document fetch karna ho aur phir us par semantic search karna ho

 FAISS + BM25 hybrid retrieval ka idea ye hai:

  1. BM25 exact keyword matching karega.
  2. FAISS semantic similarity (embeddings) se retrieve karega.
  3. Dono ke scores combine karoge.
  4. Top results ko optionally reranker ko doge.

Install

pip install faiss-cpu rank-bm25 sentence-transformers numpy

Step 1: Documents

# Toy corpus: five one-sentence documents indexed by the steps that follow.
docs = [
    "Transformers use self attention mechanism.",
    "BERT is a bidirectional transformer model.",
    "CNNs are commonly used for image classification.",
    "Attention improves long range dependency modeling.",
    "Vision Transformers achieve strong image recognition results.",
]

Step 2: BM25 Index

from rank_bm25 import BM25Okapi

# Lowercase each document and split on whitespace only, so punctuation stays
# attached to its word (e.g. "mechanism.").
tokenized_docs = [text.lower().split() for text in docs]

# Keyword index over the tokenized corpus.
bm25 = BM25Okapi(tokenized_docs)

Step 3: FAISS Index

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence-embedding model used for the documents here and for queries later.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode the whole corpus in one call and ask for a NumPy array back.
doc_embeddings = model.encode(docs, convert_to_numpy=True)

# Embedding width is read from the second axis of the encoded array.
dimension = doc_embeddings.shape[1]

# Flat L2-distance index holding every document vector.
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(doc_embeddings)

Step 4: Hybrid Retrieval

def _min_max_scale(values):
    """Rescale a NumPy score vector to the [0, 1] range."""
    lowest = values.min()
    # The epsilon keeps the division finite when every score is identical.
    return (values - lowest) / (values.max() - lowest + 1e-8)


def hybrid_search(query, top_k=3, alpha=0.5):
    """Rank ``docs`` for ``query`` by blending BM25 and FAISS scores.

    Both score vectors are min-max scaled before mixing, so ``alpha`` is a
    plain weight between the keyword signal and the embedding signal.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results to return.
        alpha: Weight given to the BM25 score; the FAISS score receives
            ``1 - alpha``. The default of 0.5 weighs both equally.

    Returns:
        A list of dicts, best match first, each with the keys ``document``,
        ``hybrid_score``, ``bm25_score`` and ``vector_score``.
    """
    # ---------- BM25 ----------
    bm25_scores = _min_max_scale(bm25.get_scores(query.lower().split()))

    # ---------- FAISS ----------
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Ask for as many neighbours as there are documents so each one is scored.
    distances, indices = faiss_index.search(query_embedding, len(docs))

    faiss_scores = np.zeros(len(docs))
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads missing neighbours with -1; writing to index -1
            # would overwrite the last document's score.
            continue
        # Turn an L2 distance into a similarity: smaller distance, larger score.
        faiss_scores[idx] = 1 / (1 + distance)

    faiss_scores = _min_max_scale(faiss_scores)

    # ---------- Hybrid Score ----------
    hybrid_scores = alpha * bm25_scores + (1 - alpha) * faiss_scores

    # argsort is ascending, so reverse it before taking the first top_k.
    ranked_indices = np.argsort(hybrid_scores)[::-1][:top_k]

    return [
        {
            "document": docs[idx],
            "hybrid_score": float(hybrid_scores[idx]),
            "bm25_score": float(bm25_scores[idx]),
            "vector_score": float(faiss_scores[idx]),
        }
        for idx in ranked_indices
    ]

Step 5: Query

results = hybrid_search("transformer attention")

# Each hit is a dict; print one per line.
for hit in results:
    print(hit)

Output roughly:

{
'document': 'Transformers use self attention mechanism.',
'hybrid_score': 0.96
}

{
'document': 'BERT is a bidirectional transformer model.',
'hybrid_score': 0.84
}

{
'document': 'Attention improves long range dependency modeling.',
'hybrid_score': 0.81
}

Production Version

Industry mein generally:

Query
  ↓
BM25 Top 50
+
Vector Search Top 50
  ↓
Merge
  ↓
Cross Encoder Reranker
  ↓
Top 5
  ↓
LLM

Example reranker:

from sentence_transformers import CrossEncoder

# Cross-encoder that scores a (query, document) pair together.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# `query` and `candidate_docs` are not defined in this snippet; they are
# expected to come from the retrieval stage.
pairs = [[query, candidate] for candidate in candidate_docs]

scores = reranker.predict(pairs)

Ye FAISS + BM25 se aaye 50-100 candidates ko re-rank karta hai aur retrieval quality kaafi improve ho jati hai.

Research-paper RAG ke liye agar tum LangChain use kar rahe ho, to main FAISS + BM25 + CrossEncoder reranking ka complete end-to-end pipeline bhi dikha sakta hoon jo arXiv PDFs par directly kaam kare.




ha de example code full uska




Agar tum arXiv papers → chunking → FAISS + BM25 → CrossEncoder reranking → LLM banana chahte ho, to ye ek realistic end-to-end example hai.

Install

pip install pymupdf
pip install sentence-transformers
pip install faiss-cpu
pip install rank-bm25
pip install langchain
pip install langchain-community
pip install arxiv

1. Download Paper from arXiv

import arxiv

# Describe the query; only the single best match is requested.
search = arxiv.Search(
    query="Retrieval Augmented Generation",
    max_results=1,
)

# Search.results() is deprecated in arxiv 2.x; results are fetched through a
# Client instead.
client = arxiv.Client()

# A default of None avoids an opaque StopIteration when nothing matches.
paper = next(client.results(search), None)
if paper is None:
    raise RuntimeError("arXiv returned no results for the query")

pdf_url = paper.pdf_url

print(pdf_url)

2. Extract PDF Text

import requests
import fitz

pdf_path = "paper.pdf"

# Fail fast on a hung connection or an HTTP error instead of saving an error
# page under the name paper.pdf.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()

with open(pdf_path, "wb") as f:
    f.write(response.content)

# The context manager closes the PDF handle once every page has been read.
with fitz.open(pdf_path) as doc:
    text = "".join(page.get_text() for page in doc)

print(text[:1000])

3. Chunking

# Current LangChain releases ship the splitters in langchain_text_splitters;
# fall back to the legacy import path for older installs.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks of up to 1000 characters, with 200 characters of overlap configured
# between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

chunks = splitter.split_text(text)

print("Chunks:", len(chunks))

4. Create Embeddings

from sentence_transformers import SentenceTransformer

# Embedding model used for every chunk here and for the query at search time.
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Encode all chunks in one call, showing a progress bar while it runs.
embeddings = embedding_model.encode(
    chunks,
    show_progress_bar=True,
    convert_to_numpy=True,
)

5. Build FAISS Index

import faiss
import numpy as np

# normalize_L2 rescales the rows of `embeddings` in place; with unit-length
# vectors the inner product used by IndexFlatIP equals cosine similarity.
faiss.normalize_L2(embeddings)

dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(dimension)
faiss_index.add(embeddings)

print("Indexed:", faiss_index.ntotal)

6. Build BM25 Index

from rank_bm25 import BM25Okapi

# Whitespace tokens of each lowercased chunk; punctuation is not stripped.
tokenized_chunks = [piece.lower().split() for piece in chunks]

bm25 = BM25Okapi(tokenized_chunks)

7. Hybrid Retrieval

def hybrid_retrieve(query, faiss_top_k=20, bm25_top_k=20):
    """Collect the chunk ids that BM25 or FAISS rank highly for ``query``.

    Args:
        query: Free-text query string.
        faiss_top_k: Number of nearest neighbours requested from FAISS.
        bm25_top_k: Number of best-scoring chunks taken from BM25.

    Returns:
        A sorted list of unique integer indices into ``chunks``; the union of
        both candidate sets.
    """
    # ----------------- BM25 -----------------
    bm25_scores = bm25.get_scores(query.lower().split())

    # argsort is ascending, so reverse it before taking the best bm25_top_k.
    bm25_top_ids = np.argsort(bm25_scores)[::-1][:bm25_top_k]

    # ----------------- FAISS -----------------
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)

    # Normalize the query in place, as done for the indexed vectors.
    faiss.normalize_L2(query_embedding)

    _, indices = faiss_index.search(query_embedding, faiss_top_k)

    # ----------------- Merge Candidates -----------------
    candidates = set(bm25_top_ids.tolist())

    # FAISS pads the result with -1 when the index holds fewer than
    # faiss_top_k vectors; used as a Python index, -1 would silently select
    # the last chunk, so negative ids are dropped.
    candidates.update(i for i in indices[0].tolist() if i >= 0)

    # Sorted so the candidate order is deterministic across runs.
    return sorted(candidates)

8. Cross Encoder Reranker

Ye sabse important part hai.

from sentence_transformers import CrossEncoder

# Cross-encoder used to re-score the merged candidates against the query.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

9. Retrieve + Rerank

def retrieve_and_rerank(query, top_k=5):
    """Fetch hybrid candidates for ``query`` and order them with the reranker.

    Args:
        query: Free-text query string.
        top_k: Maximum number of (chunk, score) pairs to return.

    Returns:
        A list of ``(chunk_text, score)`` tuples, highest score first.
    """
    candidate_texts = [chunks[cid] for cid in hybrid_retrieve(query)]

    # Score each candidate as a [query, chunk] pair.
    relevance = reranker.predict(
        [[query, passage] for passage in candidate_texts]
    )

    scored = list(zip(candidate_texts, relevance))
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return scored[:top_k]

10. Test Retrieval

results = retrieve_and_rerank("How does retrieval augmented generation work?")

# Ranks are printed 1-based; only the first 1000 characters of a chunk show.
for rank, (chunk, score) in enumerate(results, start=1):
    print("=" * 80)
    print("Rank:", rank)
    print("Score:", score)
    print(chunk[:1000])

11. Pass Context to LLM

query = "How does retrieval augmented generation work?"

retrieved_docs = retrieve_and_rerank(query, top_k=3)

# Keep only the text of each (chunk, score) pair, separated by a blank line.
context = "\n\n".join(passage for passage, _ in retrieved_docs)

Prompt:

# Prompt telling the model to answer only from the supplied context; the
# `context` and `query` variables are interpolated into the f-string.
prompt = f"""
Answer only from the provided context.

Context:
{context}

Question:
{query}
"""

Then send prompt to Gemini/OpenAI/Llama.


Better Version for Research Papers

Research papers ke liye main ye architecture use karta:

arXiv PDF
  ↓
PyMuPDF
  ↓
Section-aware chunking
  ↓
BGE Embeddings
  ↓
FAISS
+
BM25
  ↓
Candidate Merge
  ↓
CrossEncoder Reranker
  ↓
Top 5 Chunks
  ↓
LLM

Ye simple vector search se kaafi better retrieval deta hai, especially jab paper mein terms jaise "RAG", "Retrieval-Augmented Generation", "retriever", "dense retrieval" alag-alag jagah likhe ho. BM25 exact keywords pakad leta hai aur FAISS semantic similarity. CrossEncoder final ranking improve karta hai.

Saturday, 30 May 2026

added second repo in current repo as folder

 Agar tum us repo ko bas ek normal folder ki tarah rakhna chahte ho aur uske saare files parent repo ke saath GitHub par push karna chahte ho, toh Option 2 sahi hai.

Steps

  1. Inner repository ki .git directory hatao:
rm -rf lecture_16_langsmith/langsmith-masterclass/.git   # yahan lecture_16_langsmith folder ke andar wali langsmith-masterclass repo ki .git hat rahi hai
  2. Agar pehle se add ho chuka hai toh cache saaf karo:
git rm --cached -r lecture_16_langsmith/langsmith-masterclass

sql

 WITH filtered AS (

    SELECT *

    FROM Kurtis

    WHERE stock_quantity > 20

      AND color IN ('Red', 'Green')

      AND date_added BETWEEN DATE '2024-08-01' AND DATE '2024-11-01'

),

ranked AS (

    SELECT *,

           DENSE_RANK() OVER (

               PARTITION BY brand, size

               ORDER BY price DESC

           ) AS rnk

    FROM filtered

),

selected AS (

    SELECT *

    FROM ranked

    WHERE rnk = 1

),

stock_totals AS (

    SELECT brand,

           size,

           SUM(stock_quantity) AS total_stock

    FROM selected

    GROUP BY brand, size

)

SELECT

    s.brand,

    s.size,

    s.color,

    s.price,

    t.total_stock

FROM selected s

JOIN stock_totals t

  ON s.brand = t.brand

 AND s.size = t.size

ORDER BY

    s.brand,

    t.total_stock DESC,

    s.price DESC;




















-- Highest-priced Red/Green kurtis per (brand, size) among rows with
-- stock_quantity > 20 added between 2024-08-01 and 2024-11-01.
-- total_stock sums every row of the group that passes those filters, not only
-- the highest-priced ones.
SELECT kurti.brand, kurti.size, kurti.color, kurti.price, totals.total_stock
FROM Kurtis kurti
JOIN (
    -- Per-(brand, size) stock over all rows that pass the filters.
    SELECT brand, size, SUM(stock_quantity) AS total_stock
    FROM Kurtis
    WHERE stock_quantity > 20
      AND color IN ('Red', 'Green')
      AND date_added BETWEEN '2024-08-01' AND '2024-11-01'
    GROUP BY brand, size
) totals
  ON kurti.brand = totals.brand
 AND kurti.size = totals.size
WHERE kurti.stock_quantity > 20
  AND kurti.color IN ('Red', 'Green')
  AND kurti.date_added BETWEEN '2024-08-01' AND '2024-11-01'
  AND kurti.price = (
      -- Best price within the same (brand, size) under the same filters.
      SELECT MAX(rival.price)
      FROM Kurtis rival
      WHERE rival.brand = kurti.brand
        AND rival.size = kurti.size
        AND rival.stock_quantity > 20
        AND rival.color IN ('Red', 'Green')
        AND rival.date_added BETWEEN '2024-08-01' AND '2024-11-01'
  )
ORDER BY kurti.brand, totals.total_stock DESC, kurti.price DESC;

Sunday, 24 May 2026

LangChain mein agar chatbot banana ho, lekin model ka code .invoke() support na karta ho

 from transformers import AutoTokenizer, AutoModelForCausalLM

from langchain_core.runnables import RunnableLambda


model_name = "Qwen/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# device_map="auto" leaves weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")


def generate_response(user_input):
    """Generate the model's reply to a single user message.

    Args:
        user_input: Text of the user's message.

    Returns:
        The decoded text of the newly generated tokens only; the prompt
        tokens are sliced off before decoding.
    """
    chat = [{"role": "user", "content": user_input}]

    # Render the chat as a plain prompt string (tokenize=False) with the
    # generation prompt appended and thinking mode enabled.
    prompt_text = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,
    )

    encoded = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    prompt_len = encoded.input_ids.shape[1]

    generated = model.generate(**encoded, max_new_tokens=200)

    # Everything before prompt_len is the echoed prompt; decode only the rest.
    return tokenizer.decode(
        generated[0][prompt_len:],
        skip_special_tokens=True,
    )


# Wrap the plain function so it can be called through .invoke().
llm = RunnableLambda(generate_response)

answer = llm.invoke("Who are Eldians in AOT?")
print(answer)

Friday, 15 May 2026

Wednesday, 13 May 2026

3rd year chutti

 1. project -: 

2. leetcode - regularly 

3. gen ai -regularly 

4. cs fundamentals 

5. english speaking : regularly 

6. deep learning and ml ki practice -: 

Hello

how to: agar ek document fetch karna ho aur phir us par semantic search karna ho

 FAISS + BM25 hybrid retrieval ka idea ye hai: BM25 exact keyword matching karega. FAISS semantic similarity (embeddings) se retrieve ...