- LoRA Large Language Model Fine-Tuning Tutorial Series [Advanced Programming] - Using the PEFT Library
https://www.bilibili.com/video/BV1fs4y1C7vD/
- peft-turorial
https://github.com/ranchlai/lectures/tree/main/code/lora/peft-turorial
- Efficient Large Language Model training with LoRA and Hugging Face
https://www.philschmid.de/fine-tune-flan-t5-peft
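Before walking through the scripts, it helps to recall the core idea of LoRA: the pretrained weight W stays frozen and only a low-rank update B·A, scaled by alpha / r, is learned and added to it. The sketch below is purely illustrative and not part of the tutorial code; the layer sizes and variable names are made up for the example.

import torch

d, k, r, alpha = 1024, 1024, 16, 32  # hypothetical layer shape plus LoRA rank / alpha
W = torch.randn(d, k)                # frozen pretrained weight
A = torch.randn(r, k) * 0.01         # trainable low-rank factor A (small random init)
B = torch.zeros(d, r)                # trainable low-rank factor B (zero init, so the update starts at 0)

x = torch.randn(4, k)                # a batch of inputs
# forward pass with LoRA: base projection plus the scaled low-rank update
y = x @ W.T + (alpha / r) * (x @ A.T @ B.T)
# only A and B are trained, so the trainable parameter count is r * (d + k) instead of d * k
print("LoRA trainable params:", A.numel() + B.numel(), "full weight params:", W.numel())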
train.py
import argparse
import numpy as np
from datasets import concatenate_datasets, load_dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_int8_training
from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
Seq2SeqTrainingArguments,
)
def get_args():
# add arguments, with_lora
parser = argparse.ArgumentParser()
parser.add_argument("--with_lora", action="store_true")
return parser.parse_args()
def train(args):
model_id = "google/flan-t5-large" # or large
peft_model_id = "results_large" # or large
print("using lora for training large model")
# if args.with_lora:
# model_id = "google/flan-t5-large" # or large
# peft_model_id = "results_large" # or large
# print("using lora for training large model")
# else:
# model_id = "google/flan-t5-base" # or large
# peft_model_id = "results_base" # or large
# print("not using lora for training base model")
# Load dataset from the hub
dataset = load_dataset("samsum")
# use only a portion of the dataset
print(f"Train dataset size: {
len(dataset['train'])}")
print(f"Test dataset size: {
len(dataset['test'])}")
dataset["train"] = dataset["train"].select(range(0, 256))
dataset["test"] = dataset["test"].select(range(0, 32))
# Train dataset size: 14732
# Test dataset size: 819
print("using only a portion of the dataset")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
lambda x: tokenizer(x["dialogue"], truncation=True),
batched=True,
remove_columns=["dialogue", "summary"],
)
input_lengths = [len(x) for x in tokenized_inputs["input_ids"]]
# take the 85th percentile of the input lengths for better utilization
max_source_length = int(np.percentile(input_lengths, 85))
print(f"Max source length: {max_source_length}")
# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
lambda x: tokenizer(x["summary"], truncation=True),
batched=True,
remove_columns=["dialogue", "summary"],
)
target_lengths = [len(x) for x in tokenized_targets["input_ids"]]
# take the 90th percentile of the target lengths for better utilization
max_target_length = int(np.percentile(target_lengths, 90))
print(f"Max target length: {max_target_length}")
def preprocess_function(sample, padding="max_length"):
# add prefix to the input for t5
inputs = ["summarize: " + item for item in sample["dialogue"]]
# tokenize inputs
model_inputs = tokenizer(
inputs, max_length=max_source_length, padding=padding, truncation=True
)
# Tokenize targets with the `text_target` keyword argument
labels = tokenizer(
text_target=sample["summary"],
max_length=max_target_length,
padding=padding,
truncation=True,
)
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss.
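# (-100 is the default ignore_index of PyTorch's CrossEntropyLoss, so those padded
#  label positions are simply skipped when the loss is computed.)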
if padding == "max_length":
labels["input_ids"] = [
[(l if l != tokenizer.pad_token_id else -100) for l in label]
for label in labels["input_ids"]
]
model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_dataset = dataset.map(
preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"]
)
print(f"Keys of tokenized dataset: {
list(tokenized_dataset['train'].features)}")
# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk("data/train")
tokenized_dataset["test"].save_to_disk("data/eval")
# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# Define LoRA Config
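# r is the rank of the low-rank update matrices and lora_alpha the scaling factor
# (the update is scaled by lora_alpha / r); target_modules=["q", "v"] attaches LoRA
# only to the query and value projections of the T5 attention blocks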
lora_config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q", "v"],
lora_dropout=0.05,
bias="none",
task_type=TaskType.SEQ_2_SEQ_LM,
)
# prepare int-8 model for training
# model = prepare_model_for_int8_training(model)
# add LoRA adaptor
if args.with_lora:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
output_dir = peft_model_id
# example output (taken from the original flan-t5-xxl blog post): trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
tokenizer,
model=model,
label_pad_token_id=label_pad_token_id,
pad_to_multiple_of=8,
# batch_size=8,
)
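# The collator pads each batch dynamically and rounds lengths up to a multiple of 8,
# which is friendlier to fp16/bf16 tensor cores; label_pad_token_id=-100 keeps the
# padded label positions out of the loss.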
# Define training args
training_args = Seq2SeqTrainingArguments(
output_dir=output_dir,
auto_find_batch_size=True,
learning_rate=3e-4, # higher learning rate
num_train_epochs=10,
# per_device_train_batch_size=8,
logging_dir=f"{
output_dir}/logs",
logging_strategy="steps",
logging_steps=500,
save_strategy="no",
report_to="tensorboard",
lr_scheduler_type="constant",
# using only a portion of the dataset
# train = 30,
# max_eval_samples = 30,
)
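# auto_find_batch_size=True lets the Trainer retry with a smaller batch size when it
# runs out of GPU memory (this feature relies on the accelerate package)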
# Create Trainer instance
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
# train model
trainer.train()
# Save our LoRA model & tokenizer results
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# if you also want to save the base model, you can call:
# trainer.model.base_model.save_pretrained(peft_model_id)
if __name__ == "__main__":
args = get_args()
train(args)
python train.py --with_lora
python train.py
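The trainable-parameter line that model.print_trainable_parameters() prints is just the ratio of trainable to total parameters. If you want to reproduce it by hand, for example on the non-LoRA run where that PEFT method is not available, a small helper along these lines should give comparable output; it is a sketch under that assumption, not part of train.py:

def count_trainable_parameters(model):
    # mirrors the style of peft's print_trainable_parameters() report
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}")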
eval.py
# -*- coding: utf-8 -*-
import argparse
import random
from random import randrange
import evaluate
import numpy as np
import torch
from datasets import load_dataset, load_from_disk
from peft import PeftConfig, PeftModel
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
def get_args():
# add arguments, with_lora
parser = argparse.ArgumentParser()
parser.add_argument("--with_lora", action="store_true")
return parser.parse_args()
def evaluate_peft_model(sample, max_target_length=50):
# generate summary
outputs = model.generate(
input_ids=sample["input_ids"].unsqueeze(0).cuda(),
do_sample=True,
top_p=0.9,
max_new_tokens=max_target_length,
)
prediction = tokenizer.decode(
outputs[0].detach().cpu().numpy(), skip_special_tokens=True
)
# decode eval sample
# Replace -100 in the labels as we can't decode them.
labels = np.where(
sample["labels"] != -100, sample["labels"], tokenizer.pad_token_id
)
labels = tokenizer.decode(labels, skip_special_tokens=True)
# Some simple post-processing
return prediction, labels
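# Note: generation uses sampling (do_sample=True, top_p=0.9), so the ROUGE scores below
# will vary slightly between runs even with the seeds fixed; greedy or beam search is the
# usual choice when numbers need to be strictly comparable.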
if __name__ == "__main__":
# fix the seed for reproducibility
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)
args = get_args()
# Load peft config for pre-trained checkpoint etc.
if args.with_lora:
# Load peft config for pre-trained checkpoint etc.
peft_model_id = "results_large"
config = PeftConfig.from_pretrained(peft_model_id)
# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(
config.base_model_name_or_path, load_in_8bit=True, device_map={"": 0}
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"": 0})
else:
base_model_name_or_path = "results_base"
# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(
base_model_name_or_path, device_map={"": 0}
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)
# put the model in evaluation mode
model.eval()
# Load dataset from the hub and get a sample
dataset = load_dataset("samsum")
sample = dataset["test"][randrange(len(dataset["test"]))]
input_ids = tokenizer(
sample["dialogue"], return_tensors="pt", truncation=True
).input_ids.cuda()
# with torch.inference_mode():
outputs = model.generate(
input_ids=input_ids, max_new_tokens=10, do_sample=True, top_p=0.9
)
print(f"input sentence: {
sample['dialogue']}\n{
'---'* 20}")
print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")
# Metric
metric = evaluate.load("rouge")
# load test dataset from disk
test_dataset = load_from_disk("data/eval/").with_format("torch")
# run predictions over the saved eval split
# (only 32 samples were kept above, so this is much faster than the ~45 minutes the full test set takes)
predictions, references = [], []
for sample in tqdm(test_dataset):
p, l = evaluate_peft_model(sample)
predictions.append(p)
references.append(l)
# compute metric
rouge = metric.compute(
predictions=predictions, references=references, use_stemmer=True
)
# print results
print(f"rouge1: {rouge['rouge1'] * 100:2f}%")
print(f"rouge2: {rouge['rouge2'] * 100:2f}%")
print(f"rougeL: {rouge['rougeL'] * 100:2f}%")
print(f"rougeLsum: {rouge['rougeLsum'] * 100:2f}%")
# rouge1: 50.386161%
# rouge2: 24.842412%
# rougeL: 41.370130%
# rougeLsum: 41.394230%
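ROUGE scores measure n-gram overlap between the generated summaries and the reference summaries. As a quick sanity check of what metric.compute returns, you can run the metric on a toy prediction/reference pair; the snippet below is only an illustration and is not part of eval.py:

import evaluate

rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["the cat sat on the mat"],
    references=["the cat was sitting on the mat"],
    use_stemmer=True,
)
print(scores)  # dict with rouge1 / rouge2 / rougeL / rougeLsum values between 0 and 1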
python eval.py --with_lora
python eval.py
伊织 2023-09-14