fonchain-tools/backend/utils/docx_translator.py
2024-11-13 18:40:15 +08:00

244 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from docx import Document
import hashlib
import time
import requests
import os
class DocxTranslator:
    """Translate the text of a .docx file from Chinese via the Baidu Translate API.

    Body paragraphs and table cells are translated run by run. Each run is
    updated in place, so its character formatting is left untouched.
    """

    # Characters after which a long text may be cut into separate requests:
    # full-width full stop, exclamation mark, question mark and semicolon,
    # their ASCII counterparts, and the newline. The full-width ones are
    # written as escapes so they cannot be lost or confused in transit.
    _SENTENCE_ENDINGS = "\u3002\uff01\uff1f\uff1b!?;\n"

    # Characters that ``run.text`` reports for a break at the end of a run.
    _BREAK_CHARS = "\n\r\v"

    REQUEST_TIMEOUT = 10  # seconds allowed for one HTTP request
    MAX_RETRIES = 3       # attempts per segment before giving up
    RETRY_DELAY = 1       # seconds to wait between attempts

    def __init__(self):
        """Set up the API credentials, endpoint and limits."""
        # SECURITY: real credentials should not live in source control. Set
        # BAIDU_TRANSLATE_APPID / BAIDU_TRANSLATE_SECRET_KEY in the
        # environment; the literals remain only as backward-compatible
        # defaults and should be rotated and removed.
        self.appid = os.environ.get("BAIDU_TRANSLATE_APPID", "20241112002200806")
        self.secret_key = os.environ.get(
            "BAIDU_TRANSLATE_SECRET_KEY", "preM0becByYCdotRTP_a"
        )
        self.api_url = "https://api.fanyi.baidu.com/api/trans/vip/translate"
        # Maximum number of characters sent in a single request.
        self.max_length = 2000
        # Caller-facing language code -> API language code. Codes that are
        # not listed are passed through unchanged.
        self.lang_map = {
            'en': 'en',    # English
            'jp': 'jp',    # Japanese
            'cht': 'cht',  # Traditional Chinese
        }

    def _get_sign(self, text, salt):
        """Return the request signature: MD5 hex of appid + text + salt + secret key."""
        sign_str = f"{self.appid}{text}{salt}{self.secret_key}"
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def _split_text(self, text):
        """Split *text* into consecutive chunks of at most ``max_length`` characters.

        Each chunk ends just after the last sentence-ending character that
        fits; when there is none, the chunk is cut at the length limit. No
        character is dropped or added: ``''.join(chunks) == text``.
        """
        limit = max(1, self.max_length)
        chunks = []
        start = 0
        while len(text) - start > limit:
            window_end = start + limit
            last_mark = max(
                text.rfind(mark, start, window_end)
                for mark in self._SENTENCE_ENDINGS
            )
            # rfind gives -1 when no terminator is in the window: hard split.
            cut = window_end if last_mark < 0 else last_mark + 1
            chunks.append(text[start:cut])
            start = cut
        if start < len(text):
            chunks.append(text[start:])
        return chunks

    def translate_text(self, text, to_lang):
        """Translate *text* into *to_lang*, splitting it when it is too long.

        Args:
            text: Source text.
            to_lang: Target language; looked up in ``lang_map`` and passed
                through unchanged when it is not a key there.

        Returns:
            The translated text. Whitespace-only input is returned unchanged.

        Raises:
            Exception: when a segment still fails after all retries.
        """
        target_lang = self.lang_map.get(to_lang, to_lang)
        if len(text) <= self.max_length:
            return self._translate_segment(text, target_lang)
        return ''.join(
            self._translate_segment(chunk, target_lang)
            for chunk in self._split_text(text)
        )

    def _translate_segment(self, text, to_lang):
        """Translate one segment with a single API request, retrying on failure.

        Args:
            text: Segment to translate; expected to fit in one request.
            to_lang: API language code of the target language.

        Returns:
            The translation with the original leading and trailing whitespace
            re-attached, or *text* itself when it contains only whitespace.

        Raises:
            Exception: the last error seen once every attempt has failed
                (network error, malformed response, or an API error code).
        """
        core = text.strip()
        if not core:
            return text
        # Surrounding whitespace is re-attached locally so that spacing
        # between runs and chunks does not depend on what the service returns.
        leading = text[:len(text) - len(text.lstrip())]
        trailing = text[len(text.rstrip()):]

        salt = str(int(time.time()))
        params = {
            'q': core,
            'from': 'zh',
            'to': to_lang,
            'appid': self.appid,
            'salt': salt,
            'sign': self._get_sign(core, salt),
        }

        last_error = None
        for attempt in range(self.MAX_RETRIES):
            if attempt:
                time.sleep(self.RETRY_DELAY)
            try:
                # Sent as a form-encoded POST body: a long query would
                # otherwise have to fit in the URL.
                response = requests.post(
                    self.api_url, data=params, timeout=self.REQUEST_TIMEOUT
                )
                result = response.json()
                if 'error_code' in result:
                    detail = result.get('error_msg', result['error_code'])
                    raise RuntimeError(f"Translation error: {detail}")
                # One entry is returned per line of the query; keep them all.
                translated = '\n'.join(
                    entry['dst'] for entry in result['trans_result']
                )
                return f"{leading}{translated}{trailing}"
            except Exception as exc:  # any failure is retried, then re-raised
                last_error = exc
        raise last_error

    @staticmethod
    def _build_output_path(input_path):
        """Return *input_path* with ``_translated`` inserted before its extension.

        A path without an extension gets ``_translated.docx`` appended, so the
        result never equals the input path.
        """
        root, ext = os.path.splitext(input_path)
        return f"{root}_translated{ext or '.docx'}"

    def _translate_paragraph(self, paragraph, target_lang, label="paragraph"):
        """Translate the runs of one paragraph in place.

        Every run is translated before any run is modified, so when a
        translation fails the paragraph keeps its original text. Failures are
        printed and do not propagate. Runs without visible text are left
        untouched.

        Args:
            paragraph: python-docx paragraph to update.
            target_lang: Target language, as accepted by ``translate_text``.
            label: Word used in the error message ("paragraph" or "cell").
        """
        if not paragraph.text.strip():
            return
        pending = []
        try:
            for run in paragraph.runs:
                original = run.text
                core = original.rstrip(self._BREAK_CHARS)
                if not core.strip():
                    continue
                translated = self.translate_text(core, target_lang)
                pending.append((run, translated, core != original))
        except Exception as e:
            print(f"Error translating {label}: {e}")
            return
        for run, translated, had_break in pending:
            # Assigning run.text replaces the run's content only; the run's
            # formatting properties stay as they are.
            run.text = translated
            if had_break:
                run.add_break()  # default break type (line break)

    def translate_document(self, input_path, target_lang='en', progress_callback=None):
        """Translate a .docx file and save the result next to it.

        Args:
            input_path: Path of the source document (a string).
            target_lang: Target language, as accepted by ``translate_text``.
            progress_callback: Optional callable receiving an integer
                percentage after each body paragraph and each table cell.

        Returns:
            Path of the saved document: the input path with ``_translated``
            inserted before the extension.
        """
        doc = Document(input_path)
        output_path = self._build_output_path(input_path)

        paragraphs = doc.paragraphs
        cells = [
            cell
            for table in doc.tables
            for row in table.rows
            for cell in row.cells
        ]
        total_items = len(paragraphs) + len(cells)
        current_item = 0

        # Body paragraphs.
        for paragraph in paragraphs:
            self._translate_paragraph(paragraph, target_lang, "paragraph")
            current_item += 1
            if progress_callback:
                progress_callback(int((current_item / total_items) * 100))

        # Table cells. A merged cell shows up once per grid position it
        # spans; translate it only the first time but still count every
        # position so the percentage matches total_items.
        seen = set()
        for cell in cells:
            element = cell._tc
            if element not in seen:
                seen.add(element)
                for paragraph in cell.paragraphs:
                    self._translate_paragraph(paragraph, target_lang, "cell")
            current_item += 1
            if progress_callback:
                progress_callback(int((current_item / total_items) * 100))

        doc.save(output_path)
        return output_path