@@ -314,10 +314,14 @@ async def create_completion(
         else:
             kwargs["logits_processor"].extend(_min_tokens_logits_processor)
 
-    iterator_or_completion: Union[
-        llama_cpp.CreateCompletionResponse,
-        Iterator[llama_cpp.CreateCompletionStreamResponse],
-    ] = await run_in_threadpool(llama, **kwargs)
+    try:
+        iterator_or_completion: Union[
+            llama_cpp.CreateCompletionResponse,
+            Iterator[llama_cpp.CreateCompletionStreamResponse],
+        ] = await run_in_threadpool(llama, **kwargs)
+    except Exception as err:
+        exit_stack.close()
+        raise err
 
     if isinstance(iterator_or_completion, Iterator):
         # EAFP: It's easier to ask for forgiveness than permission
@@ -344,6 +348,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
             ping_message_factory=_ping_message_factory,
         )
     else:
+        exit_stack.close()
         return iterator_or_completion
 
 
@@ -508,9 +513,13 @@ async def create_chat_completion(
         else:
             kwargs["logits_processor"].extend(_min_tokens_logits_processor)
 
-    iterator_or_completion: Union[
-        llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
-    ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
+    try:
+        iterator_or_completion: Union[
+            llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
+        ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
+    except Exception as err:
+        exit_stack.close()
+        raise err
 
     if isinstance(iterator_or_completion, Iterator):
         # EAFP: It's easier to ask for forgiveness than permission
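The pattern in both hunks is the same: `exit_stack` holds the request's resources, and before this patch it was only closed when a streaming response finished, so an exception raised inside `run_in_threadpool`, or a plain non-streaming return, leaked it. Below is a minimal standalone sketch of that pattern, not the server's real API; `acquire_resource()` and `run_inference()` are hypothetical stand-ins:

```python
# Sketch of the cleanup pattern in the diff above. acquire_resource() and
# run_inference() are placeholders, not llama-cpp-python functions.
import contextlib
from typing import Iterator, Union


@contextlib.contextmanager
def acquire_resource():
    print("acquired")           # e.g. taking a model lock
    try:
        yield "resource"
    finally:
        print("released")


def run_inference(resource: str, stream: bool) -> Union[dict, Iterator[dict]]:
    if stream:
        return iter([{"chunk": 0}, {"chunk": 1}])
    return {"text": "done"}


def handle_request(stream: bool) -> Union[dict, Iterator[dict]]:
    exit_stack = contextlib.ExitStack()
    resource = exit_stack.enter_context(acquire_resource())
    try:
        result = run_inference(resource, stream)
    except Exception:
        exit_stack.close()      # unwind eagerly: nothing else ever will
        raise
    if isinstance(result, Iterator):
        def iterator() -> Iterator[dict]:
            yield from result
            exit_stack.close()  # streaming: release after the last chunk
        return iterator()
    exit_stack.close()          # non-streaming: the leak the patch plugs
    return result
```

The bare `raise` here does the same job as the patch's `except Exception as err: ... raise err`, while keeping the original traceback intact.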