@@ -314,10 +314,14 @@ async def create_completion(
314314 else :
315315 kwargs ["logits_processor" ].extend (_min_tokens_logits_processor )
316316
317- iterator_or_completion : Union [
318- llama_cpp .CreateCompletionResponse ,
319- Iterator [llama_cpp .CreateCompletionStreamResponse ],
320- ] = await run_in_threadpool (llama , ** kwargs )
317+ try :
318+ iterator_or_completion : Union [
319+ llama_cpp .CreateCompletionResponse ,
320+ Iterator [llama_cpp .CreateCompletionStreamResponse ],
321+ ] = await run_in_threadpool (llama , ** kwargs )
322+ except Exception as err :
323+ exit_stack .close ()
324+ raise err
321325
322326 if isinstance (iterator_or_completion , Iterator ):
323327 # EAFP: It's easier to ask for forgiveness than permission
@@ -344,6 +348,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
344348 ping_message_factory = _ping_message_factory ,
345349 )
346350 else :
351+ exit_stack .close ()
347352 return iterator_or_completion
348353
349354
@@ -508,9 +513,13 @@ async def create_chat_completion(
508513 else :
509514 kwargs ["logits_processor" ].extend (_min_tokens_logits_processor )
510515
511- iterator_or_completion : Union [
512- llama_cpp .ChatCompletion , Iterator [llama_cpp .ChatCompletionChunk ]
513- ] = await run_in_threadpool (llama .create_chat_completion , ** kwargs )
516+ try :
517+ iterator_or_completion : Union [
518+ llama_cpp .ChatCompletion , Iterator [llama_cpp .ChatCompletionChunk ]
519+ ] = await run_in_threadpool (llama .create_chat_completion , ** kwargs )
520+ except Exception as err :
521+ exit_stack .close ()
522+ raise err
514523
515524 if isinstance (iterator_or_completion , Iterator ):
516525 # EAFP: It's easier to ask for forgiveness than permission
0 commit comments