ls-gpt2-demo

A demo of training and inference for a GPT2 model with LightSeq.


Training runs on A100 GPUs; inference runs on a T4, generating sequences of up to 500 tokens.

fp16 training

First, pretrain the model in fp16.

Put the training data in data/train.txt, one sample per line, each at most 500 tokens long.
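For example, a minimal sketch of preparing the file (the sample sentences are placeholders, and the 500-token cap mirrors the -l 500 used at export time, not a value taken from this repo):

import os

# Hypothetical corpus; replace with your own sentences.
samples = ["第一条训练样本", "第二条训练样本"]

os.makedirs("data", exist_ok=True)
with open("data/train.txt", "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(sample[:500] + "\n")  # one sample per line, capped at 500 characters

Then launch fp16 pretraining on 8 GPUs: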

python3 -m torch.distributed.launch \
    --nproc_per_node=8 \
    train.py \
    --model_name_or_path uer/gpt2-chinese-cluecorpussmall \
    --train_file data/train.txt \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 8 \
    --num_train_epochs 150 \
    --learning_rate 1.5e-4 \
    --output_dir model/fp16 \
    --overwrite_output_dir \
    --fp16 \
    --logging_steps 10 \
    --enable_quant false
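Optionally, sanity-check the trained checkpoint with plain Hugging Face Transformers before exporting. A minimal sketch, assuming the Trainer wrote a standard checkpoint to model/fp16 and that the uer model's BERT-style tokenizer applies (the prompt is a placeholder):

from transformers import BertTokenizer, GPT2LMHeadModel

tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
model = GPT2LMHeadModel.from_pretrained("model/fp16")  # the Trainer output dir

inputs = tokenizer("你好", return_tensors="pt")  # placeholder prompt
outputs = model.generate(**inputs, max_length=50, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))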

Export the fp16 model

The -l flag sets the maximum sequence length:

python3 export.py \
    -m model/fp16/pytorch_model.bin \
    -l 500
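This writes model/fp16/pytorch_model.hdf5. To peek inside the exported file, one option is h5py (not a dependency of this demo; a minimal sketch assuming the export is ordinary HDF5):

import h5py

# Print the name of every group and dataset in the exported file.
with h5py.File("model/fp16/pytorch_model.hdf5", "r") as f:
    f.visit(print)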

fp16 inference

The -p flag specifies the pretrained model name (the -i flag takes the input prompt):

python3 generate.py \
    -m model/fp16/pytorch_model.hdf5 \
    -i "" \
    -p "uer/gpt2-chinese-cluecorpussmall"

int8 training

First run fp16 pretraining, exactly as above. If you already have a trained fp16 model, you can skip this step and finetune it directly into int8:

python3 -m torch.distributed.launch \
    --nproc_per_node=8 \
    train.py \
    --model_name_or_path uer/gpt2-chinese-cluecorpussmall \
    --train_file data/train.txt \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 8 \
    --num_train_epochs 150 \
    --learning_rate 1.5e-4 \
    --output_dir model/fp16 \
    --overwrite_output_dir \
    --fp16 \
    --logging_steps 10 \
    --enable_quant false

int8 finetuning

Then finetune the int8 model starting from the fp16 checkpoint:

python3 -m torch.distributed.launch \
    --nproc_per_node=8 \
    train.py \
    --model_name_or_path uer/gpt2-chinese-cluecorpussmall \
    --train_file data/train.txt \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 8 \
    --num_train_epochs 200 \
    --learning_rate 5e-6 \
    --output_dir model/int8 \
    --overwrite_output_dir \
    --resume_from_checkpoint model/fp16 \
    --fp16 \
    --logging_steps 10 \
    --enable_quant true

Export the int8 model

When exporting the int8 model, add the -q flag to produce a quantized export:

python3 export.py \
    -m model/int8/pytorch_model.bin \
    -l 500 \
    -q

int8 inference

Add the -q flag to run int8 inference:

python3 generate.py \
    -m model/int8/pytorch_model.hdf5 \
    -i "" \
    -p "uer/gpt2-chinese-cluecorpussmall" \
    -q