A demo of GPT2 model training and inference with LightSeq
This demo has been tested on A100 and T4 GPUs. The training data is `data/train.txt`, which contains 500 samples.
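The exact contents of `data/train.txt` are not shown here. Below is a minimal sketch for a smoke test, assuming the file is plain text with one training sample per line (the sentences are made up):

```shell
# Hypothetical: build a tiny plain-text training file, one sample per line.
# Replace with your real corpus; the path matches --train_file below.
mkdir -p data
printf '%s\n' \
    "今天天气真好。" \
    "我喜欢自然语言处理。" \
    "轻量级的推理引擎可以加速文本生成。" \
    > data/train.txt
```

With the data in place, first fine-tune the pretrained model in fp16 with quantization disabled: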
```shell
python3 -m torch.distributed.launch \
--nproc_per_node=8 \
train.py \
--model_name_or_path uer/gpt2-chinese-cluecorpussmall \
--train_file data/train.txt \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 8 \
--num_train_epochs 150 \
--learning_rate 1.5e-4 \
--output_dir model/fp16 \
--overwrite_output_dir \
--fp16 \
--logging_steps 10 \
--enable_quant false
```
After training finishes, export the fp16 checkpoint to an HDF5 file for LightSeq inference. The `-l` flag specifies the maximum sequence length (500 here):

```shell
python3 export.py \
-m model/fp16/pytorch_model.bin \
-l 500
```
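To sanity-check the export, you can list the top-level groups of the HDF5 file. A minimal sketch, assuming `h5py` is installed (the actual group names depend on LightSeq's export format):

```shell
# Hypothetical sanity check: print the top-level groups of the exported file.
python3 -c "
import h5py
with h5py.File('model/fp16/pytorch_model.hdf5', 'r') as f:
    print(list(f.keys()))
"
```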
Then run generation with the exported model. `-i` is the input prefix (empty here) and `-p` is the Hugging Face model name, used to load the tokenizer:

```shell
python3 generate.py \
-m model/fp16/pytorch_model.hdf5 \
-i "" \
-p "uer/gpt2-chinese-cluecorpussmall"
To obtain an int8 model, quantization-aware training runs in two stages. First, train in fp16 with quantization disabled (the same command as above):

```shell
python3 -m torch.distributed.launch \
--nproc_per_node=8 \
train.py \
--model_name_or_path uer/gpt2-chinese-cluecorpussmall \
--train_file data/train.txt \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 8 \
--num_train_epochs 150 \
--learning_rate 1.5e-4 \
--output_dir model/fp16 \
--overwrite_output_dir \
--fp16 \
--logging_steps 10 \
--enable_quant false
```
Then fine-tune with quantization enabled (`--enable_quant true`), resuming from the fp16 checkpoint and using a much smaller learning rate:

```shell
python3 -m torch.distributed.launch \
--nproc_per_node=8 \
train.py \
--model_name_or_path uer/gpt2-chinese-cluecorpussmall \
--train_file data/train.txt \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 8 \
--num_train_epochs 200 \
--learning_rate 5e-6 \
--output_dir model/int8 \
--overwrite_output_dir \
--resume_from_checkpoint model/fp16 \
--fp16 \
--logging_steps 10 \
--enable_quant true
```
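After both stages finish, each output directory should contain a checkpoint: the fp16 stage wrote to `model/fp16`, and the int8 finetune wrote to `model/int8`. A quick way to confirm before exporting:

```shell
# Both training stages should have produced a pytorch_model.bin checkpoint.
ls model/fp16/pytorch_model.bin model/int8/pytorch_model.bin
```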
When exporting the int8 model, add the `-q` flag:

```shell
python3 export.py \
-m model/int8/pytorch_model.bin \
-l 500 \
-q
```
Likewise, add the `-q` flag when running generation with the int8 model:

```shell
python3 generate.py \
-m model/int8/pytorch_model.hdf5 \
-i "" \
-p "uer/gpt2-chinese-cluecorpussmall" \
-q
```
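To get a rough sense of the int8 speedup, you can time the two generation commands. A sketch only; wall-clock time here includes model loading and tokenizer setup, so treat the numbers as indicative:

```shell
# Hypothetical rough timing comparison of fp16 vs. int8 generation.
time python3 generate.py -m model/fp16/pytorch_model.hdf5 -i "" -p "uer/gpt2-chinese-cluecorpussmall"
time python3 generate.py -m model/int8/pytorch_model.hdf5 -i "" -p "uer/gpt2-chinese-cluecorpussmall" -q
```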