
Commit 0b73e12

remove `load_state_as_np` (PaddlePaddle#7120)
1 parent c1157e5 commit 0b73e12

File tree

23 files changed: +12 -38 lines changed
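Every hunk in this commit applies the same mechanical change: the deprecated `load_state_as_np` keyword, and `low_cpu_mem_usage` where it appears alongside it, is dropped from `from_pretrained` calls. A minimal before/after sketch of the pattern ("my-org/my-model" is a hypothetical checkpoint name, not one from this diff):

from paddlenlp.transformers import AutoModelForCausalLM

# Before this commit: callers passed extra flags to load weights as numpy
# arrays and reduce peak memory while loading.
# model = AutoModelForCausalLM.from_pretrained(
#     "my-org/my-model",          # hypothetical checkpoint name
#     load_state_as_np=True,      # keyword removed by this commit
#     low_cpu_mem_usage=True,     # removed where present
#     dtype="float16",
# )

# After this commit: the same call without the removed keywords.
model = AutoModelForCausalLM.from_pretrained("my-org/my-model", dtype="float16")
model.eval()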

examples/benchmark/ceval/model_evaluator.py (+1 -1)

@@ -30,7 +30,7 @@ def __init__(self, choices, k, model_name_or_path, temperature=0.2):
         super().__init__(choices, model_name_or_path, k)
         self.model_name_or_path = model_name_or_path
         self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
-        self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="float16", low_cpu_mem_usage=True)
+        self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="float16")
         self.model.eval()
         self.generation_config = dict(
             temperature=temperature,

examples/benchmark/peft/paddle/benchmark.py (-2)

@@ -92,7 +92,6 @@ def main():
     if model_args.model_name_or_path in ["gpt3-6.7B-en", "gpt3-13B-en"]:
         model = GPTForCausalLM.from_pretrained(
             model_args.model_name_or_path,
-            low_cpu_mem_usage=True,
             use_flash_attention=model_args.use_flash_attention,
             dtype=dtype,
             tensor_parallel_degree=training_args.tensor_parallel_degree,
@@ -104,7 +103,6 @@ def main():
     else:
         model = AutoModelForCausalLM.from_pretrained(
             model_args.model_name_or_path,
-            low_cpu_mem_usage=True,
             use_flash_attention=model_args.use_flash_attention,
             dtype=dtype,
             tensor_parallel_degree=training_args.tensor_parallel_degree,

examples/benchmark/peft/paddle/inference_benchmark.py (-2)

@@ -67,8 +67,6 @@ def predict_forward(model, inputs):
 tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
 model = AutoModelForCausalLM.from_pretrained(
     args.model_name_or_path,
-    load_state_as_np=True,
-    low_cpu_mem_usage=True,
 )
 if model.base_model_prefix == "llama":
     tokenizer.pad_token = tokenizer.unk_token

examples/code_generation/codegen/README.md (-1)

@@ -119,7 +119,6 @@ python codegen_server.py
 - `min_length`: minimum length of the generated text; defaults to 0
 - `max_length`: maximum length of the generated text; defaults to 16
 - `decode_strategy`: decoding strategy; defaults to "greedy_search"
-- `load_state_as_np`: load model parameters as numpy arrays to save GPU memory; defaults to True
 - `use_fast`: whether to use FastGeneration to speed up inference; defaults to True
 - `use_fp16_decoding`: whether to decode in fp16 to save GPU memory and speed up inference; defaults to True

examples/code_generation/codegen/codegen_server.py (+1 -4)

@@ -35,7 +35,6 @@ class DefaultConfig:
     min_length = 0
     max_length = 16
     decode_strategy = "greedy_search"
-    load_state_as_np = True
     use_faster = True
     use_fp16_decoding = True
     default_dtype = "float16" if use_faster and use_fp16_decoding else "float32"
@@ -64,9 +63,7 @@ class Output(BaseModel):
 paddle.set_default_dtype(generate_config.default_dtype)

 tokenizer = CodeGenTokenizer.from_pretrained(generate_config.model_name_or_path)
-model = CodeGenForCausalLM.from_pretrained(
-    generate_config.model_name_or_path, load_state_as_np=generate_config.load_state_as_np
-)
+model = CodeGenForCausalLM.from_pretrained(generate_config.model_name_or_path)

 app = FastAPI()

examples/language_model/t5/tests/t5_mp.py (-3)

@@ -44,7 +44,6 @@ def main():
         tensor_parallel_degree=tensor_parallel_degree,
         tensor_parallel_rank=tensor_parallel_rank,
         dtype="float32",
-        low_cpu_mem_usage=True,
     )
     model.eval()
     loss = model(
@@ -63,7 +62,6 @@ def main():
         tensor_parallel_degree=tensor_parallel_degree,
         tensor_parallel_rank=tensor_parallel_rank,
         dtype="float32",
-        low_cpu_mem_usage=True,
     )
     load_model.eval()
     loss = load_model(
@@ -85,7 +83,6 @@ def main():
         tensor_parallel_degree=tensor_parallel_degree,
         tensor_parallel_rank=tensor_parallel_rank,
         dtype="float32",
-        low_cpu_mem_usage=True,
     )
     load_model.eval()
     loss = load_model(

examples/text_generation/opt/demo.py (+1 -1)

@@ -23,7 +23,7 @@ class Demo:
     def __init__(self, model_name_or_path, max_predict_len=128):
         self.tokenizer = GPTTokenizer.from_pretrained(model_name_or_path)
         logger.info("Loading the model parameters, please wait...")
-        self.model = OPTForCausalLM.from_pretrained(model_name_or_path, load_state_as_np=True)
+        self.model = OPTForCausalLM.from_pretrained(model_name_or_path)
         self.model.eval()
         self.max_predict_len = max_predict_len
         logger.info("Model loaded.")

fast_generation/perf/codegen_perf.py (+1 -1)

@@ -37,7 +37,7 @@ def perf_pd(args):
     place = "gpu"
     place = paddle.set_device(place)
     tokenizer = CodeGenTokenizer.from_pretrained(args.model_name_or_path)
-    model = CodeGenForCausalLM.from_pretrained(args.model_name_or_path, load_state_as_np=True)
+    model = CodeGenForCausalLM.from_pretrained(args.model_name_or_path)
     model.eval()
     load_mem = query_by_id(args.gpu_id)

fast_generation/perf/pegasus_perf.py (+1 -1)

@@ -40,7 +40,7 @@ def perf_pd(args):
     place = "gpu"
     place = paddle.set_device(place)
     tokenizer = PegasusChineseTokenizer.from_pretrained(args.model_name_or_path)
-    model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path, load_state_as_np=True)
+    model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path)
     model.eval()
     load_mem = query_by_id(args.gpu_id)
     input_ids_np = [np.random.choice(range(len(tokenizer.vocab)), args.input_len) for _ in range(args.batch_size)]

fast_generation/samples/codegen_16b_sample.py (+1 -1)

@@ -21,7 +21,7 @@
 model_name = "Salesforce/codegen-16B-mono"

 tokenizer = CodeGenTokenizer.from_pretrained(model_name)
-model = CodeGenForCausalLM.from_pretrained(model_name, load_state_as_np=True)
+model = CodeGenForCausalLM.from_pretrained(model_name)
 model.eval()

 inputs = "def hello"

fast_generation/samples/gpt_mp_sample.py (+1 -1)

@@ -96,7 +96,7 @@ def main(args):
     if args.profile:
         MODEL_CLASSES[model_name][0].generate = profile(args.batch_size)(MODEL_CLASSES[model_name][0].generate)
     tokenizer = MODEL_CLASSES[model_name][-1].from_pretrained(model_name)
-    model = MODEL_CLASSES[model_name][0].from_pretrained(model_name, load_state_as_np=True)
+    model = MODEL_CLASSES[model_name][0].from_pretrained(model_name)
     model.eval()

     # NOTE: When using prompt, open this and replace the text with what you want.

fast_generation/samples/gptj_sample.py (+1 -1)

@@ -20,7 +20,7 @@
 model_name = "EleutherAI/gpt-j-6B"

 tokenizer = GPTJTokenizer.from_pretrained(model_name)
-model = GPTJForCausalLM.from_pretrained(model_name, load_state_as_np=True)
+model = GPTJForCausalLM.from_pretrained(model_name)
 model.eval()

 inputs = "What is PaddleNLP?"

fast_generation/samples/plato_xl_sample.py (+1 -1)

@@ -106,7 +106,7 @@ def main(args):
     if args.profile:
         UnifiedTransformerLMHeadModel.generate = profile(args.batch_size)(UnifiedTransformerLMHeadModel.generate)
     tokenizer = UnifiedTransformerTokenizer.from_pretrained("plato-xl")
-    model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl", load_state_as_np=True)
+    model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl")
     model.eval()

     history = [

llm/ernie-3.5-se/predict_generation.py (-1)

@@ -99,7 +99,6 @@ def __init__(self, args=None, tokenizer=None, model=None, **kwargs):
             args.model_name_or_path,
             tensor_parallel_degree=tensor_parallel_degree,
             tensor_parallel_rank=tensor_parallel_rank,
-            load_state_as_np=True,
             dtype=dtype,
             use_flash_attention=use_flash_attn,
         )

llm/ernie-3.5-se/run_pretrain.py (-1)

@@ -394,7 +394,6 @@ def main():
             model_args.model_name_or_path,
             config=config,
             dtype=dtype,
-            load_state_as_np=True,
             use_progressive_seq_len=True,
         )
     else:

llm/glm/finetune_generation.py (-1)

@@ -109,7 +109,6 @@ def main():
         model_args.model_name_or_path,
         output_predict=True,
         parallel_output=True,
-        load_state_as_np=True,
         dtype=dtype,  # todo enable set dtype to avoid additional mem usage
         tensor_parallel_degree=training_args.tensor_parallel_degree,
         tensor_parallel_rank=training_args.tensor_parallel_rank,

llm/glm/predict_generation.py (-2)

@@ -80,9 +80,7 @@ def __init__(self, args):
             args.model_name_or_path,
             tensor_parallel_degree=tensor_parallel_degree,
             tensor_parallel_rank=tensor_parallel_rank,
-            load_state_as_np=True,
             dtype=dtype,
-            low_cpu_mem_usage=True,
         )
         if self.args.lora_path is not None:
             self.model = LoRAModel.from_pretrained(self.model, self.args.lora_path)

llm/gpt-3/finetune_generation.py (-1)

@@ -153,7 +153,6 @@ def main():
         model_args.model_name_or_path,
         config=config,
         dtype=dtype,
-        load_state_as_np=True,
     )
     if model_args.lora:
         if model_args.lora_path is None:

llm/gpt-3/predict_generation.py (-2)

@@ -73,8 +73,6 @@ def __init__(self, args=None, tokenizer=None, model=None, **kwargs):

         self.model = GPTForCausalLM.from_pretrained(
             args.model_name_or_path,
-            load_state_as_np=True,
-            low_cpu_mem_usage=True,
             dtype=dtype,
             tensor_parallel_degree=tensor_parallel_degree,
             tensor_parallel_rank=tensor_parallel_rank,

model_zoo/plato-xl/infer.py (+1 -1)

@@ -112,7 +112,7 @@ def main(args):
     if args.profile:
         UnifiedTransformerLMHeadModel.generate = profile(args.batch_size)(UnifiedTransformerLMHeadModel.generate)
     tokenizer = UnifiedTransformerTokenizer.from_pretrained("plato-xl")
-    model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl", load_state_as_np=True)
+    model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl")
     model.eval()

     history = [

paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py (+1 -1)

@@ -70,7 +70,7 @@ def do_predict(args):
     paddle.set_default_dtype("float16")

     model_name = "plato-xl"
-    model = UnifiedTransformerLMHeadModel.from_pretrained(model_name, load_state_as_np=True)
+    model = UnifiedTransformerLMHeadModel.from_pretrained(model_name)
     tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

     plato = FasterUnifiedTransformer(model=model, use_fp16_decoding=args.use_fp16_decoding)

paddlenlp/taskflow/text2text_generation.py (-1)

@@ -117,7 +117,6 @@ def _construct_model(self, model):
         """
         model_instance = AutoModelForCausalLM.from_pretrained(
             self.model,
-            load_state_as_np=True,
             dtype=self._dtype,
         )
         # Load the model parameter for the predict

tests/transformers/test_modeling_utils.py (+1 -7)

@@ -18,7 +18,7 @@
 from multiprocessing import Pool
 from tempfile import TemporaryDirectory

-from paddlenlp.transformers import BertModel, TinyBertModel
+from paddlenlp.transformers import BertModel
 from paddlenlp.utils.env import CONFIG_NAME, MODEL_HOME, PADDLE_WEIGHTS_NAME
 from tests.testing_utils import slow

@@ -57,12 +57,6 @@ def test_from_pretrained_cache_dir_pretrained_init(self):
         # check against double appending model_name in cache_dir
         self.assertFalse(os.path.exists(os.path.join(tempdir, model_name, model_name)))

-    @slow
-    def test_from_pretrained_with_load_as_state_np_params(self):
-        """init model with `load_state_as_np` params"""
-        model = TinyBertModel.from_pretrained("tinybert-4l-312d", load_state_as_np=True)
-        self.assertIsNotNone(model)
-
     @slow
     def test_multiprocess_downloading(self):
         """test downloading with multi-process. Some errors may be triggered when downloading model
