Word Document Element preservation (I know this is not related to PDF's, but there are no other resources for help) #4465

Prasaderp · 2025-04-23T03:06:03Z

Prasaderp
Apr 23, 2025

Hi,
I know that this issue is not related to PDF's, but there are no other resources for help.

So, currently I am working on translating word document from English to Indic languages which includes many element like graphs, Images, tables etc.
I was using python-docx and xlmx tree libraries but none of them are able to preserve IMAGES as elements.

see I have attached a screenshot for the same below.

`import os
import re
import torch
import gc
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from docx import Document
import time
import subprocess
import shutil

Configuration

MODEL_NAME = "facebook/nllb-200-distilled-1.3B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LANGUAGES = {
"Hindi": {"code": "hin_Deva", "iso": "hi"},
"Tamil": {"code": "tam_Taml", "iso": "ta"},
"Telugu": {"code": "tel_Telu", "iso": "te"}
}
MEMORY_THRESHOLD = 0.7 # Lowered to trigger memory reset earlier
MAX_LENGTH_DEFAULT = 256
MAX_TOKENS_PER_BLOCK = 200

Initialize the translation model and tokenizer

def initialize_model():
print("Initializing translation model...")
start = time.time()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(
MODEL_NAME,
torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
).to(DEVICE).eval()
print(f"Model loaded in {time.time() - start:.2f}s")
return tokenizer, model

tokenizer, model = initialize_model()

Utility Functions

def parse_user_entities(user_input):
entities = [e.strip() for e in user_input.split(',') if e.strip()]
print(f" Entities to preserve: {', '.join(entities) if entities else 'None'}")
return sorted(set(entities), key=len, reverse=True)

def chunk_text_blocks(text, max_tokens=MAX_TOKENS_PER_BLOCK):
"""Split text into blocks of approximately max_tokens, respecting sentence boundaries"""
sentence_boundaries = [m.end() for m in re.finditer(r'[.!?]', text)]
chunks = []
last_end = 0
current_chunk = ""
current_tokens = 0

for end in sentence_boundaries:
    sentence = text[last_end:end]
    sentence_tokens = len(sentence.split())
    if current_tokens + sentence_tokens > max_tokens or len(current_chunk) > max_tokens * 4:
        if current_chunk:
            chunks.append(current_chunk)
            current_tokens = 0
        current_chunk = sentence
        current_tokens = sentence_tokens
    else:
        current_chunk += sentence
        current_tokens += sentence_tokens
    last_end = end

if last_end < len(text):
    remaining = text[last_end:]
    remaining_tokens = len(remaining.split())
    if current_tokens + remaining_tokens > max_tokens or len(current_chunk) > max_tokens * 4:
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = remaining
    else:
        current_chunk += remaining

if current_chunk:
    chunks.append(current_chunk)

return chunks

def replace_with_placeholders(text, entities):
placeholder_map = {}
modified_text = text

patterns = [
    (re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'), "emails"),
    (re.compile(r'https?://\S+|www\.\S+|github\.com/\S+|linkedin\.com/\S+'), "URLs"),
    (re.compile(r'\+\d{2}-\d+'), "phone_numbers"),
    (re.compile(r'CGPA:\s*\d+\.\d+'), "CGPA"),
    (re.compile(r'\b\d{4}\s*–\s*\d{2}\b'), "year_ranges"),
    (re.compile(r'\|'), "separators")
]

for pattern, category in patterns:
    matches = pattern.findall(modified_text)
    for match in matches:
        placeholder = f"__PRESERVE{len(placeholder_map):03d}__"
        placeholder_map[placeholder] = match
        modified_text = modified_text.replace(match, placeholder, 1)
        print(f" Preserved {category}: '{match}'")

for entity in entities:
    pattern = re.compile(r'\b' + re.escape(entity) + r'\b', re.IGNORECASE)
    def replacer(match):
        original = match.group()
        placeholder = f"__PRESERVE{len(placeholder_map):03d}__"
        placeholder_map[placeholder] = original
        return placeholder
    modified_text, count = pattern.subn(replacer, modified_text, 1)
    if count > 0:
        print(f" Preserved entity '{entity}' {count} time(s)")

print(f" Text with placeholders: '{modified_text[:50]}...'")
return modified_text, placeholder_map

def needs_translation(modified_text):
cleaned = re.sub(r'PRESERVE\d{3}', '', modified_text)
return bool(re.search(r'[a-zA-Z]', cleaned))

def restore_entities(text, placeholder_map, original_text):
restored_text = text
restored_entities = []
for placeholder, original in placeholder_map.items():
if placeholder in restored_text:
restored_text = restored_text.replace(placeholder, original, 1)
restored_entities.append(original)
print(f" Restored '{original}' at placeholder '{placeholder}'")
else:
original_pos = original_text.find(original)
if original_pos != -1 and original not in restored_text:
ratio = len(restored_text) / len(original_text) if len(original_text) > 0 else 1
approx_pos = int(original_pos * ratio)
restored_text = restored_text[:approx_pos] + original + restored_text[approx_pos:]
restored_entities.append(original)
print(f" Placeholder '{placeholder}' not found; inserted '{original}' at estimated position")
elif original not in restored_text:
restored_text = f"{original} {restored_text}"
restored_entities.append(original)
print(f" Placeholder '{placeholder}' not found; prepended '{original}'")

# Cleanup any leftover placeholders
leftover_placeholders = re.findall(r'__PRESERVE\d{3}__', restored_text)
if leftover_placeholders:
    print(f"⚠️ Found leftover placeholders: {leftover_placeholders}. Removing them.")
    restored_text = re.sub(r'__PRESERVE\d{3}__', '', restored_text)

if restored_entities:
    print(f" Restored entities: {restored_entities}")
return restored_text.strip()

def split_text_by_proportions(text, runs, original_text, placeholder_map):
if not runs or not text.strip():
return [""] * len(runs)

original_texts = [run.text.strip() for run in runs]
parts = []

if len(runs) == 1 or all(not t for t in original_texts[1:]):
    return [text] + [""] * (len(runs) - 1)

structural_markers = any(t in ("|", ":", "-", "•", " ") for t in original_texts)
if not structural_markers:
    for i, t in enumerate(original_texts):
        if t:
            parts = [""] * i + [text] + [""] * (len(runs) - i - 1)
            break
    else:
        parts = [text] + [""] * (len(runs) - 1)
else:
    total_length = sum(len(t) for t in original_texts if t)
    remaining_text = text
    for i, original in enumerate(original_texts):
        if not original:
            parts.append("")
            continue
        if original in ("|", ":", "-", "•", " "):
            parts.append(original)
            continue
        split_length = max(1, int(len(remaining_text) * (len(original) / total_length))) if total_length > 0 else len(remaining_text)
        split_point = remaining_text.find(" ", split_length) if " " in remaining_text[split_length:] else len(remaining_text)
        if split_point == -1:
            split_point = split_length
        part = remaining_text[:split_point].strip()
        parts.append(part)
        remaining_text = remaining_text[split_point:].strip()

while len(parts) < len(runs):
    parts.append("")
return parts[:len(runs)]

def get_dynamic_batch_size(num_texts, fast_mode=False):
if DEVICE != "cuda":
return min(16, num_texts)
total_memory = torch.cuda.get_device_properties(0).total_memory
free_memory = total_memory - torch.cuda.memory_allocated()
tokens_per_text = MAX_LENGTH_DEFAULT
bytes_per_text = tokens_per_text * 4
max_batch = max(1, min(free_memory // bytes_per_text, num_texts))
return min(64 if fast_mode else 16, max_batch)

def translate_batch(texts, target_lang="Hindi", fast_mode=False):
if not texts:
return []
batch_size = get_dynamic_batch_size(len(texts), fast_mode)
translated_texts = []
target_lang_code = LANGUAGES[target_lang]["code"]
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
max_length = max(MAX_LENGTH_DEFAULT, max(len(t.split()) for t in batch) * 2)
try:
inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(DEVICE)
with torch.no_grad():
outputs = model.generate(
inputs,
forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang_code),
max_length=max_length,
num_beams=3,
use_cache=True,
early_stopping=True
)
translated = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
translated_texts.extend([re.sub(r'^.+|\s.+$|^\s…', '', t.strip()) for t in translated])
del inputs, outputs
if DEVICE == "cuda":
torch.cuda.empty_cache()
gc.collect()
except RuntimeError as e:
print(f"⚠️ Memory error: {e}. Reducing batch size and retrying...")
if batch_size > 1:
batch_size = max(1, batch_size // 2)
translated_texts.extend(translate_batch(batch, target_lang, fast_mode))
else:
raise
return translated_texts

def reset_gpu_memory():
global model, tokenizer
if DEVICE == "cuda":
del model
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
gc.collect()
print(" Refreshing GPU memory...")
start = time.time()
tokenizer, model = initialize_model()
print(f" GPU memory refreshed in {time.time()-start:.2f}s")

def check_memory_and_reset(total_segments):
if DEVICE != "cuda" or total_segments <= 100:
return False
total_memory = torch.cuda.get_device_properties(0).total_memory
allocated_memory = torch.cuda.memory_allocated()
if allocated_memory / total_memory > MEMORY_THRESHOLD:
reset_gpu_memory()
return True
return False

Document Processing Functions

def collect_texts(doc, entities):
texts = []
# Body paragraphs
for para_idx, para in enumerate(doc.paragraphs):
if para.text.strip():
modified_text, placeholder_map = replace_with_placeholders(para.text, entities)
needs_trans = needs_translation(modified_text)
texts.append(("body", para_idx, para, para.text, modified_text, placeholder_map, needs_trans))

# Headers and footers
for section_idx, section in enumerate(doc.sections):
    for header_idx, header in enumerate(section.header.paragraphs):
        if header.text.strip():
            modified_text, placeholder_map = replace_with_placeholders(header.text, entities)
            needs_trans = needs_translation(modified_text)
            texts.append(("header", (section_idx, header_idx), header, header.text, modified_text, placeholder_map, needs_trans))
    for footer_idx, footer in enumerate(section.footer.paragraphs):
        if footer.text.strip():
            modified_text, placeholder_map = replace_with_placeholders(footer.text, entities)
            needs_trans = needs_translation(modified_text)
            texts.append(("footer", (section_idx, footer_idx), footer, footer.text, modified_text, placeholder_map, needs_trans))

# Tables
for table_idx, table in enumerate(doc.tables):
    for row_idx, row in enumerate(table.rows):
        for cell_idx, cell in enumerate(row.cells):
            for para_idx, para in enumerate(cell.paragraphs):
                if para.text.strip():
                    modified_text, placeholder_map = replace_with_placeholders(para.text, entities)
                    needs_trans = needs_translation(modified_text)
                    texts.append(("table", (table_idx, row_idx, cell_idx, para_idx), para, para.text, modified_text, placeholder_map, needs_trans))
return texts

def process_document(input_path, output_path, entities, target_lang="Hindi"):
# Load the original document
doc = Document(input_path)
texts = collect_texts(doc, entities)
if not texts:
print("No translatable text found in the document.")
doc.save(output_path)
return

total_segments = len([t for t in texts if t[6]])  # Count segments needing translation
fast_mode = total_segments <= 50
to_translate = []
translation_metadata = []

# Prepare texts for translation, applying chunking
for text in texts:
    section_type, index, para, original_text, modified_text, placeholder_map, needs_trans = text
    if needs_trans:
        token_count = len(modified_text.split())
        if token_count > MAX_TOKENS_PER_BLOCK:
            print(f" Large text segment detected ({token_count} tokens). Splitting into {MAX_TOKENS_PER_BLOCK}-token chunks.")
            chunks = chunk_text_blocks(modified_text, max_tokens=MAX_TOKENS_PER_BLOCK)
            for chunk in chunks:
                to_translate.append(chunk)
                translation_metadata.append((section_type, index, para, original_text, chunk, placeholder_map, True))
        else:
            to_translate.append(modified_text)
            translation_metadata.append((section_type, index, para, original_text, modified_text, placeholder_map, True))
    else:
        translation_metadata.append((section_type, index, para, original_text, modified_text, placeholder_map, False))

# Check and reset GPU memory if needed
check_memory_and_reset(len(to_translate))

# Translate texts
print(f"Translating {len(to_translate)} text segments to {target_lang}...")
translated_texts = translate_batch(to_translate, target_lang, fast_mode)

# Reassemble chunks and process translations
translated_idx = 0
current_text = []
current_metadata = None

for text in translation_metadata:
    section_type, index, para, original_text, modified_text, placeholder_map, needs_trans = text
    if needs_trans:
        token_count = len(modified_text.split())
        if token_count > MAX_TOKENS_PER_BLOCK:
            # Collect all chunks for this segment
            if current_metadata != (section_type, index, para):
                if current_text:
                    # Process previous segment
                    translated_text = " ".join(current_text)
                    translated_text = restore_entities(translated_text, current_metadata[5], current_metadata[3])
                    assign_translated_text(current_metadata[2], translated_text, current_metadata[3], current_metadata[5])
                    current_text = []
                current_metadata = (section_type, index, para, original_text, modified_text, placeholder_map)
            current_text.append(translated_texts[translated_idx])
            translated_idx += 1
        else:
            # Process single segment
            if current_text:
                # Process previous chunked segment
                translated_text = " ".join(current_text)
                translated_text = restore_entities(translated_text, current_metadata[5], current_metadata[3])
                assign_translated_text(current_metadata[2], translated_text, current_metadata[3], current_metadata[5])
                current_text = []
                current_metadata = None
            translated_text = translated_texts[translated_idx]
            translated_text = restore_entities(translated_text, placeholder_map, original_text)
            assign_translated_text(para, translated_text, original_text, placeholder_map)
            translated_idx += 1
    else:
        # Process non-translated segment
        if current_text:
            # Process previous chunked segment
            translated_text = " ".join(current_text)
            translated_text = restore_entities(translated_text, current_metadata[5], current_metadata[3])
            assign_translated_text(current_metadata[2], translated_text, current_metadata[3], current_metadata[5])
            current_text = []
            current_metadata = None
        translated_text = restore_entities(modified_text, placeholder_map, original_text)
        assign_translated_text(para, translated_text, original_text, placeholder_map)

# Process any remaining chunked segment
if current_text:
    translated_text = " ".join(current_text)
    translated_text = restore_entities(translated_text, current_metadata[5], current_metadata[3])
    assign_translated_text(current_metadata[2], translated_text, current_metadata[3], current_metadata[5])

# Save the modified document
doc.save(output_path)

# Reset GPU memory after processing to prevent pile-up
reset_gpu_memory()

def assign_translated_text(para, translated_text, original_text, placeholder_map):
runs = [run for run in para.runs]
if not runs:
if para.text.strip():
run = para.add_run(translated_text)
# Preserve basic formatting from the first run if it exists
if para.runs and len(para.runs) > 1:
run.bold = para.runs[0].bold
run.italic = para.runs[0].italic
run.underline = para.runs[0].underline
return

translated_parts = split_text_by_proportions(translated_text, runs, original_text, placeholder_map)
for run, part in zip(runs, translated_parts):
    run.text = part

Conversion Function

def convert_doc_to_docx(doc_path):
if not doc_path.endswith('.doc'):
return doc_path

base_name = os.path.splitext(doc_path)[0]
docx_path = base_name + '.docx'

try:
    subprocess.run(['libreoffice', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    print("LibreOffice is installed.")
except FileNotFoundError:
    print("Installing LibreOffice...")
    try:
        subprocess.run(['apt-get', 'update'], check=True)
        subprocess.run(['apt-get', 'install', '-y', 'libreoffice'], check=True)
        print("LibreOffice installed successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Failed to install LibreOffice: {e}")
        raise Exception("LibreOffice is required for .doc conversion but could not be installed.")

if os.path.exists(docx_path):
    os.remove(docx_path)

try:
    subprocess.run(['libreoffice', '--headless', '--convert-to', 'docx', doc_path, '--outdir', os.path.dirname(doc_path)], check=True)
    if os.path.exists(docx_path):
        return docx_path
    else:
        raise FileNotFoundError(f"Converted file {docx_path} not found after conversion.")
except subprocess.CalledProcessError as e:
    print(f"Conversion failed: {e}")
    raise
except FileNotFoundError as e:
    print(f"{e}")
    raise

Main Execution

if name == "main":
doc_path = "/content/AIMT2002101_English_Quant.docx"
output_path = "/content/translated_output.docx"

print("\n" + "=" * 40)
print("Select target language (Hindi, Tamil, Telugu):")
target_lang = input().strip().capitalize()
if target_lang not in LANGUAGES:
    print(f"Invalid language. Defaulting to Hindi.")
    target_lang = "Hindi"

print("Enter entities to preserve (comma-separated, e.g., 'Name, Place etc') (optional):")
entities = parse_user_entities(input().strip())

print(f"\n Loading document from {doc_path}...")

if doc_path.endswith('.doc'):
    print("Converting .doc to .docx...")
    docx_path = convert_doc_to_docx(doc_path)
    print(f" Converted to {docx_path}")
else:
    docx_path = doc_path

start_time = time.time()
output_path = f"/content/translated_{target_lang.lower()}.docx"
process_document(docx_path, output_path, entities, target_lang)
print(f"\n Translation to {target_lang} completed and saved to {output_path} in {time.time() - start_time:.2f}s")`

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Word Document Element preservation (I know this is not related to PDF's, but there are no other resources for help) #4465

Uh oh!

{{title}}

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

Word Document Element preservation (I know this is not related to PDF's, but there are no other resources for help) #4465

Uh oh!

Prasaderp Apr 23, 2025

Configuration

Initialize the translation model and tokenizer

Utility Functions

Document Processing Functions

Conversion Function

Main Execution

Replies: 0 comments

Prasaderp
Apr 23, 2025