Word Document Element preservation (I know this is not related to PDFs, but there are no other resources for help) #4465
Prasaderp asked this question in Looking for help · Unanswered
Hi,
I know that this issue is not related to PDFs, but there are no other resources for help.
I am currently working on translating Word documents from English to Indic languages. These documents contain many elements such as graphs, images, and tables. I was using python-docx and lxml (etree), but neither of them preserves images as elements. I have attached a screenshot of the problem below.
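Since the failing piece is image runs specifically, here is a minimal sketch of the run-level check I am after. It leans on python-docx internals (`run._element.xml`) rather than documented API, and the paths and `translate()` stub are hypothetical placeholders:

```python
from docx import Document

def translate(text):
    # Hypothetical stub: plug the actual MT call in here.
    return text

def run_has_image(run):
    # Inline images live in a <w:drawing> (or legacy <w:pict>) element
    # inside the run's XML. NOTE: run._element is python-docx internal
    # API -- this check is an assumption, not a documented interface.
    xml = run._element.xml
    return "<w:drawing" in xml or "<w:pict" in xml

doc = Document("input.docx")  # hypothetical path
for para in doc.paragraphs:
    for run in para.runs:
        if run_has_image(run):
            continue  # leave image runs untouched
        if run.text.strip():
            run.text = translate(run.text)
doc.save("output.docx")
```

My full script, for reference: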
```python
import os
import re
import torch
import gc
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from docx import Document
import time
import subprocess
import shutil

# Configuration
MODEL_NAME = "facebook/nllb-200-distilled-1.3B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LANGUAGES = {
    "Hindi": {"code": "hin_Deva", "iso": "hi"},
    "Tamil": {"code": "tam_Taml", "iso": "ta"},
    "Telugu": {"code": "tel_Telu", "iso": "te"}
}
MEMORY_THRESHOLD = 0.7  # Lowered to trigger memory reset earlier
MAX_LENGTH_DEFAULT = 256
MAX_TOKENS_PER_BLOCK = 200
# Initialize the translation model and tokenizer
def initialize_model():
    print("Initializing translation model...")
    start = time.time()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, src_lang="eng_Latn")
    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
    ).to(DEVICE).eval()
    print(f"Model loaded in {time.time() - start:.2f}s")
    return tokenizer, model

tokenizer, model = initialize_model()
# Utility Functions
def parse_user_entities(user_input):
    entities = [e.strip() for e in user_input.split(',') if e.strip()]
    print(f" Entities to preserve: {', '.join(entities) if entities else 'None'}")
    return sorted(set(entities), key=len, reverse=True)
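# Hypothetical example: parse_user_entities("IIT Bombay, NEET") returns
# ["IIT Bombay", "NEET"] -- longest first, so longer entities are swapped
# for placeholders before any entity that is a substring of another.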
def chunk_text_blocks(text, max_tokens=MAX_TOKENS_PER_BLOCK):
    """Split text into blocks of ~max_tokens words, respecting sentence boundaries."""
    boundaries = [m.end() for m in re.finditer(r'[.!?]', text)] or [len(text)]
    chunks, last_end, current_chunk, current_tokens = [], 0, "", 0
    for end in boundaries:
        sentence = text[last_end:end]
        last_end = end
        n = len(sentence.split())
        if current_chunk and current_tokens + n > max_tokens:
            chunks.append(current_chunk.strip())
            current_chunk, current_tokens = "", 0
        current_chunk, current_tokens = current_chunk + sentence, current_tokens + n
    current_chunk += text[last_end:]  # keep any trailing text after the last boundary
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
def replace_with_placeholders(text, entities):
    placeholder_map, modified_text = {}, text
    for idx, entity in enumerate(entities):
        if entity in modified_text:
            placeholder = f"PRESERVE{idx:03d}"  # matches the PRESERVE\d{3} pattern below
            modified_text = modified_text.replace(entity, placeholder)
            placeholder_map[placeholder] = entity
    return modified_text, placeholder_map
def needs_translation(modified_text):
    cleaned = re.sub(r'PRESERVE\d{3}', '', modified_text)
    return bool(re.search(r'[a-zA-Z]', cleaned))
def restore_entities(text, placeholder_map, original_text):
    restored_text = text
    restored_entities = []
    for placeholder, original in placeholder_map.items():
        if placeholder in restored_text:
            restored_text = restored_text.replace(placeholder, original, 1)
            restored_entities.append(original)
            print(f" Restored '{original}' at placeholder '{placeholder}'")
        else:
            original_pos = original_text.find(original)
            if original_pos != -1 and original not in restored_text:
                ratio = len(restored_text) / len(original_text) if len(original_text) > 0 else 1
                approx_pos = int(original_pos * ratio)
                restored_text = restored_text[:approx_pos] + original + restored_text[approx_pos:]
                restored_entities.append(original)
                print(f" Placeholder '{placeholder}' not found; inserted '{original}' at estimated position")
            elif original not in restored_text:
                restored_text = f"{original} {restored_text}"
                restored_entities.append(original)
                print(f" Placeholder '{placeholder}' not found; prepended '{original}'")
    return restored_text, restored_entities
def split_text_by_proportions(text, runs, original_text, placeholder_map):
    """Distribute translated text across runs in proportion to original run lengths."""
    if not runs or not text.strip():
        return [""] * len(runs)
    total = sum(len(run.text) for run in runs) or 1
    pieces, cursor = [], 0
    for i, run in enumerate(runs):
        if i == len(runs) - 1:
            pieces.append(text[cursor:])  # last run takes the remainder
        else:
            n = round(len(text) * len(run.text) / total)
            pieces.append(text[cursor:cursor + n])
            cursor += n
    return pieces
def get_dynamic_batch_size(num_texts, fast_mode=False):
    if DEVICE != "cuda":
        return min(16, num_texts)
    total_memory = torch.cuda.get_device_properties(0).total_memory
    free_memory = total_memory - torch.cuda.memory_allocated()
    tokens_per_text = MAX_LENGTH_DEFAULT
    bytes_per_text = tokens_per_text * 4
    max_batch = max(1, min(free_memory // bytes_per_text, num_texts))
    return min(64 if fast_mode else 16, max_batch)
def translate_batch(texts, target_lang="Hindi", fast_mode=False):
    if not texts:
        return []
    batch_size = get_dynamic_batch_size(len(texts), fast_mode)
    translated_texts = []
    target_lang_code = LANGUAGES[target_lang]["code"]
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        max_length = max(MAX_LENGTH_DEFAULT, max(len(t.split()) for t in batch) * 2)
        try:
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(DEVICE)
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang_code),
                    max_length=max_length,
                    num_beams=3,
                    use_cache=True,
                    early_stopping=True
                )
            translated = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            translated_texts.extend([t.strip() for t in translated])
            del inputs, outputs
            if DEVICE == "cuda":
                torch.cuda.empty_cache()
            gc.collect()
        except RuntimeError as e:
            print(f"⚠️ Memory error: {e}. Reducing batch size and retrying...")
            if DEVICE == "cuda":
                torch.cuda.empty_cache()
            if len(batch) > 1:
                # Retry each half; the recursive calls pick their own batch size
                mid = len(batch) // 2
                translated_texts.extend(translate_batch(batch[:mid], target_lang, fast_mode))
                translated_texts.extend(translate_batch(batch[mid:], target_lang, fast_mode))
            else:
                raise
    return translated_texts
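# Hypothetical usage (the model is downloaded on the first call):
#   translate_batch(["Solve the quadratic equation."], target_lang="Hindi")
#   returns a one-element list containing the Hindi translation.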
def reset_gpu_memory():
    global model, tokenizer
    if DEVICE == "cuda":
        del model
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        gc.collect()
        print(" Refreshing GPU memory...")
        start = time.time()
        tokenizer, model = initialize_model()
        print(f" GPU memory refreshed in {time.time()-start:.2f}s")
def check_memory_and_reset(total_segments):
    if DEVICE != "cuda" or total_segments <= 100:
        return False
    total_memory = torch.cuda.get_device_properties(0).total_memory
    allocated_memory = torch.cuda.memory_allocated()
    if allocated_memory / total_memory > MEMORY_THRESHOLD:
        reset_gpu_memory()
        return True
    return False
# Document Processing Functions
def collect_texts(doc, entities):
    texts = []
    # Body paragraphs
    for para_idx, para in enumerate(doc.paragraphs):
        if para.text.strip():
            modified_text, placeholder_map = replace_with_placeholders(para.text, entities)
            needs_trans = needs_translation(modified_text)
            texts.append(("body", para_idx, para, para.text, modified_text, placeholder_map, needs_trans))
    return texts
def process_document(input_path, output_path, entities, target_lang="Hindi"):
    # Load the original document
    doc = Document(input_path)
    texts = collect_texts(doc, entities)
    if not texts:
        print("No translatable text found in the document.")
        doc.save(output_path)
        return
    to_translate = [t for t in texts if t[6]]  # entries flagged as needing translation
    translations = translate_batch([t[4] for t in to_translate], target_lang)
    for (kind, idx, para, original, modified, placeholder_map, _), translated in zip(to_translate, translations):
        restored, _ = restore_entities(translated, placeholder_map, original)
        assign_translated_text(para, restored, original, placeholder_map)
        check_memory_and_reset(len(texts))
    doc.save(output_path)
    print(f"Saved translated document to {output_path}")
def assign_translated_text(para, translated_text, original_text, placeholder_map):
    runs = [run for run in para.runs]
    if not runs:
        if para.text.strip():
            para.add_run(translated_text)
        return
    # Spread the translation across the existing text runs so each run keeps
    # its formatting; runs with no text (e.g. inline images) are left alone.
    text_runs = [run for run in runs if run.text]
    if not text_runs:
        return
    pieces = split_text_by_proportions(translated_text, text_runs, original_text, placeholder_map)
    for run, piece in zip(text_runs, pieces):
        run.text = piece
# Conversion Function
def convert_doc_to_docx(doc_path):
    if not doc_path.endswith('.doc'):
        return doc_path
    # Assumes LibreOffice is installed; soffice converts the file to .docx
    if shutil.which('soffice') is None:
        raise RuntimeError("LibreOffice (soffice) is required to convert .doc files")
    out_dir = os.path.dirname(doc_path) or '.'
    subprocess.run(['soffice', '--headless', '--convert-to', 'docx',
                    '--outdir', out_dir, doc_path], check=True)
    return os.path.splitext(doc_path)[0] + '.docx'
# Main Execution
if __name__ == "__main__":
    doc_path = "/content/AIMT2002101_English_Quant.docx"
    output_path = "/content/translated_output.docx"
    doc_path = convert_doc_to_docx(doc_path)
    entities = parse_user_entities(input("Entities to preserve (comma-separated): "))
    process_document(doc_path, output_path, entities, target_lang="Hindi")
```
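One way I check whether images survive a save round-trip is to count `doc.inline_shapes` before and after translation. A minimal sketch, using the paths from the script above:

```python
from docx import Document

# Count inline images before and after; if the counts differ,
# image runs were clobbered somewhere in the pipeline.
before = len(Document("/content/AIMT2002101_English_Quant.docx").inline_shapes)
after = len(Document("/content/translated_output.docx").inline_shapes)
print(f"inline images: {before} before, {after} after")
```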