【Demo】AI-ModelScope/bert-base-uncased 模型训练及使用
以下是基于 ModelScope 的 bert-base-uncased 模型训练及使用样例，可直接运行。
import json
import os
import random
import tempfile

# Restrict CUDA to physical GPU index 6. This is set before torch is imported
# so the variable is already in place when CUDA gets initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

import numpy as np
import torch
from modelscope import AutoModelForSequenceClassification, AutoTokenizer
from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import EpochBasedTrainer, build_trainer
from modelscope.utils.config import Config
from modelscope.utils.constant import ModeKeys
from modelscope.utils.hub import read_config
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

# Random seed
seed = 42
# Seed Python's RNG, PyTorch and NumPy (in that order) with the same value so
# sample generation and training are repeatable between runs.
for _seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    _seed_rng(seed)

# Synonym vocabulary
# Verbs fill the "I {verb} {noun}" sentence template.
positive_verbs = [
    "love",
    "like",
    "admire",
    "adore",
    "enjoy",
    "appreciate",
]
negative_verbs = [
    "hate",
    "dislike",
    "despise",
    "loathe",
    "detest",
    "abhor",
]

# Objects of the verb template.
positive_nouns = [
    "Hugging Face",
    "this product",
    "the service",
    "the app",
    "this book",
    "the movie",
    "the experience",
]
negative_nouns = [
    "this",
    "it",
    "the interface",
    "the design",
    "the concept",
    "the approach",
]

# Adjectives fill the "This is {adj}" sentence template.
positive_adjectives = [
    "fantastic",
    "amazing",
    "wonderful",
    "excellent",
    "superb",
    "outstanding",
    "brilliant",
]
negative_adjectives = [
    "terrible",
    "awful",
    "horrible",
    "dreadful",
    "lousy",
    "poor",
]

# Generate more samples
def generate_samples(pairs_per_class=25):
    """Build a synthetic, class-balanced sentiment dataset from the word lists.

    Every loop iteration emits two sentences for a class: one from the verb
    template ("I {verb} {noun}") and one from the adjective template
    ("This is {adj}"). Positive sentences end with "!" and get label 1;
    negative sentences end with "." and get label 0.

    Args:
        pairs_per_class: Number of (verb sentence, adjective sentence) pairs
            drawn per class. The default of 25 yields 50 positive and
            50 negative samples.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. All positive
        samples come first, followed by all negative samples.
    """
    texts = []
    labels = []

    # Positive samples. The order of the random.choice calls is kept as in
    # the original so a fixed seed reproduces the same sentences.
    for _ in range(pairs_per_class):
        # Verb template
        verb = random.choice(positive_verbs)
        noun = random.choice(positive_nouns)
        texts.append(f"I {verb} {noun}!")
        labels.append(1)
        # Adjective template
        adj = random.choice(positive_adjectives)
        texts.append(f"This is {adj}!")
        labels.append(1)

    # Negative samples
    for _ in range(pairs_per_class):
        # Verb template
        verb = random.choice(negative_verbs)
        noun = random.choice(negative_nouns)
        texts.append(f"I {verb} {noun}.")
        labels.append(0)
        # Adjective template
        adj = random.choice(negative_adjectives)
        texts.append(f"This is {adj}.")
        labels.append(0)

    return texts, labels


# Original samples
original_texts = [
    "I love Hugging Face!",
    "I hate this.",
    "This is fantastic!",
    "I dislike it.",
]
original_labels = [1, 0, 1, 0]

# Generate new samples
new_texts, new_labels = generate_samples()

# Merge the hand-written samples with the generated ones
texts = [*original_texts, *new_texts]
labels = [*original_labels, *new_labels]

# Split into training and validation sets (20% held out for validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=seed,
)

# Load the pretrained ModelScope tokenizer
print("加载预训练的ModelScope tokenizer")
tokenizer = AutoTokenizer.from_pretrained("AI-ModelScope/bert-base-uncased")

# Encode the training texts: truncate to at most 128 tokens and pad the batch
train_encodings = tokenizer(
    train_texts,
    max_length=128,
    padding=True,
    truncation=True,
)
val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
    max_length=128,
)


class SentimentDataset(Dataset):
    """Dataset pairing tokenizer encodings with their sentiment labels."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings and labels.

        Args:
            encodings: Object indexable by ``'input_ids'`` and
                ``'attention_mask'``, each holding one entry per sample.
            labels: Sequence with one label per sample; its length defines
                the dataset length.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds ``'input_ids'``, ``'attention_mask'`` and ``'labels'``;
        any other field present in the encodings is not forwarded.
        """
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx]),
        }

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)


# Create the datasets
print("创建数据集")
train_dataset, val_dataset = (
    SentimentDataset(train_encodings, train_labels),
    SentimentDataset(val_encodings, val_labels),
)

# Load the pretrained ModelScope model
print("加载预训练的ModelScope模型")
model = AutoModelForSequenceClassification.from_pretrained(
    "AI-ModelScope/bert-base-uncased",
    num_labels=2,  # binary classification task
)

# Revised evaluation function implementation
def compute_metrics(outputs, dataloader):
    """Compute classification accuracy from batched model outputs.

    Args:
        outputs: Iterable of per-batch outputs. Each item is indexed with
            ``'logits'`` and the value must support
            ``.detach().cpu().numpy()``; the class is taken as the argmax
            over axis 1.
        dataloader: Iterable of batches, each indexed with ``'labels'``.

    Returns:
        ``{'accuracy': <accuracy_score of labels vs. predictions>}``.

    NOTE(review): predictions and labels are collected in two separate
    passes, so this assumes ``dataloader`` yields batches in the same order
    that produced ``outputs`` (i.e. no shuffling) -- confirm against the
    trainer. The ``(outputs, dataloader)`` signature is what the original
    author described as the expected one; verify against the ModelScope
    version in use.
    """
    # Collect every predicted class id.
    all_preds = []
    for batch_output in outputs:
        logits = batch_output['logits'].detach().cpu().numpy()
        all_preds.extend(np.argmax(logits, axis=1).tolist())

    # Collect every ground-truth label.
    all_labels = []
    for batch in dataloader:
        batch_labels = batch['labels'].cpu().numpy()
        all_labels.extend(batch_labels.tolist())

    return {'accuracy': accuracy_score(all_labels, all_preds)}


# Configure training parameters
cfg_dict = {
    "task": "text-classification",  # added entry
    "model": {
        "type": "AutoModelForSequenceClassification",
        "model_name_or_path": "AI-ModelScope/bert-base-uncased",
        "num_labels": 2,
    },
    "train": {
        "work_dir": "./out_dirs",
        "dataloader": {
            "batch_size_per_gpu": 2,
            "workers_per_gpu": 1,
        },
        "optimizer": {
            "type": "AdamW",
            "lr": 5e-5,
        },
        # Linear decay of the learning-rate factor from 1.0 to 0.0 over
        # 3 scheduler iterations (same number as max_epochs below).
        "lr_scheduler": {
            "type": "LinearLR",
            "start_factor": 1.0,
            "end_factor": 0.0,
            "total_iters": 3,
        },
        "hooks": [
            {"type": "CheckpointHook", "interval": 1},
            {"type": "TextLoggerHook", "interval": 10},
            {"type": "EvaluationHook", "interval": 1},
        ],
        "max_epochs": 3,
    },
    "evaluation": {
        "dataloader": {
            "batch_size_per_gpu": 2,
            "workers_per_gpu": 1,
        },
    },
}

# Build the configuration directly from the dict
cfg = Config(cfg_dict)

# Write the configuration to a file so the trainer can be pointed at it.
# NOTE: despite the original "temporary file" wording, this is a regular
# file named config.json in the current working directory.
config_path = 'config.json'
with open(config_path, 'w') as config_file:
    config_file.write(json.dumps(cfg_dict))

# Create the trainer
trainer = EpochBasedTrainer(
    model=model,
    cfg_file=config_path,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    custom_eval_fn=compute_metrics,  # pass the function object directly
)

# Start training
print("训练开始")
trainer.train()
print("训练结束")

# Evaluate the model
#eval_results = trainer.evaluate(val_dataset)
eval_results = trainer.evaluate()
accuracy = eval_results['accuracy']
print(f"Accuracy: {accuracy:.4f}")

# Save the model
# Model weights and tokenizer files go to the same directory so both can be
# reloaded from one path.
model_dir = "./sentiment_model"
trainer.model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Load the model back from disk
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Predict on a few example texts
example_texts = ["I love this!", "I hate it."]
inputs = tokenizer(
    example_texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

# Inference only: put the model in evaluation mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# Print the predictions (label 1 is positive, anything else is negative)
for text, label in zip(example_texts, predicted_labels):
    sentiment = 'positive' if label == 1 else 'negative'
    print(f"Text: {text} -- Predicted Label: {sentiment}")