Time. : legal_bertmodel_pretraining how to do full pipeline

import torch

import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

# -------------------------

# Dummy Dataset

# -------------------------

# Toy corpus written as (sentence, class id) pairs and then transposed into
# the two parallel lists the rest of the script consumes.
#   0 = Contract
#   1 = Court Case
texts, labels = map(
    list,
    zip(
        ("This employment agreement is between company and employee.", 0),
        ("The court dismissed the appeal.", 1),
        ("Employee shall maintain confidentiality.", 0),
        ("The judge granted bail to the accused.", 1),
    ),
)

# -------------------------

# Tokenizer

# -------------------------

# Checkpoint identifier handed to every `from_pretrained` call in this script
# (tokenizer here, encoder in LegalClassifier) so both load the same model.
MODEL_NAME = "nlpaueb/legal-bert-base-uncased"

# Module-level tokenizer shared by LegalDataset.__getitem__ and the
# prediction code at the bottom of the script.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class LegalDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Sequence of raw input strings.
        labels: Sequence of integer class ids, parallel to ``texts``.
        tokenizer: Optional callable used in place of the module-level
            ``tokenizer``. It is invoked with the same keyword arguments
            (``truncation``, ``padding``, ``max_length``, ``return_tensors``).
        max_length: Length every sample is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        self._tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch axis
            removed) and ``label`` (0-dim long tensor).
        """
        # Fall back to the module-level tokenizer only when none was injected;
        # the lookup happens here so the global is resolved lazily.
        tok = self._tokenizer if self._tokenizer is not None else tokenizer
        enc = tok(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the leading batch axis; a bare squeeze()
            # would also drop the sequence axis whenever it has length 1.
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            # CrossEntropyLoss expects class indices as long tensors.
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Wrap the sample sentences in a dataset and serve them as shuffled
# mini-batches of two.
dataset = LegalDataset(texts=texts, labels=labels)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=2)

# -------------------------

# Custom Model

# -------------------------

class LegalClassifier(nn.Module):
    """Pretrained encoder followed by a small MLP classification head.

    Args:
        num_labels: Number of output classes (size of the logits' last axis).
        freeze_bert: When true, every encoder parameter gets
            ``requires_grad = False`` so only the head is trained.
    """

    def __init__(self, num_labels=2, freeze_bert=False):
        super().__init__()
        self.bert = AutoModel.from_pretrained(MODEL_NAME)

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        # Take the encoder width from its config rather than hard-coding 768,
        # so the head still fits if MODEL_NAME points at another checkpoint.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids.

        The head is applied to the encoder's hidden state at sequence
        position 0 of each sample.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        return self.classifier(cls_embedding)

# Build the classifier; this loads the pretrained encoder weights.
model = LegalClassifier()

# -------------------------
# Training
# -------------------------

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

EPOCHS = 3

# Put the model in training mode (enables dropout) before the loop.
model.train()

for epoch in range(EPOCHS):
    # Sum of the per-batch losses for this epoch (not an average).
    total_loss = 0

    for batch in loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        logits = model(batch["input_ids"], batch["attention_mask"])
        loss = criterion(logits, batch["label"])

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported once per epoch, after every batch has been processed.
    print(f"Epoch {epoch+1} Loss = {total_loss:.4f}")

# -------------------------

# Prediction

# -------------------------

# Switch to inference mode so dropout is disabled.
model.eval()

test_text = "This agreement shall remain confidential."

enc = tokenizer(
    test_text,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

# No gradients are needed for a forward-only pass.
with torch.no_grad():
    logits = model(enc["input_ids"], enc["attention_mask"])

# Index of the highest-scoring class along the class axis.
pred = torch.argmax(logits, dim=1)

# Per the training labels: 0 = Contract, 1 = Court Case.
print("Prediction:", pred.item())

If you want to freeze BERT (train only the classifier head):


for p in self.bert.parameters():
    p.requires_grad = False 
Put this inside `__init__`, right after `self.bert` is created.

Time.

Pages

Friday, 19 June 2026

legal_bertmodel_pretraining how to do full pipeline

No comments:

Post a Comment

Hello

legal_bertmodel_pretraining how to do full pipeline

Report Abuse

Labels