fonchain-tools/backend/utils/docx_translator.py

244 lines
12 KiB
Python
Raw Normal View History

2024-11-13 10:40:15 +00:00
from docx import Document
import hashlib
import time
import requests
import os
class DocxTranslator:
    """Translate the text of a .docx file with the Baidu Translate HTTP API.

    The source language is always sent as ``zh``. Body paragraphs and table
    cells are translated run by run, in place, so run-level formatting and
    any content that carries no text are left as they were.
    """

    def __init__(self, appid=None, secret_key=None, timeout=10):
        """Set up API credentials and request settings.

        Args:
            appid: Baidu Translate application id. When omitted, the
                ``BAIDU_TRANSLATE_APPID`` environment variable is used, then
                the legacy built-in value.
            secret_key: Baidu Translate secret key. When omitted, the
                ``BAIDU_TRANSLATE_SECRET_KEY`` environment variable is used,
                then the legacy built-in value.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        # NOTE(review): the literal fallbacks below are the credentials that
        # were previously hard-coded in this file. They are kept only so that
        # existing deployments keep working; they should be rotated and
        # removed from source control.
        self.appid = (
            appid
            or os.environ.get("BAIDU_TRANSLATE_APPID")
            or "20241112002200806"
        )
        self.secret_key = (
            secret_key
            or os.environ.get("BAIDU_TRANSLATE_SECRET_KEY")
            or "preM0becByYCdotRTP_a"
        )
        self.api_url = "https://api.fanyi.baidu.com/api/trans/vip/translate"
        # Maximum number of characters sent in a single request.
        self.max_length = 2000
        self.timeout = timeout
        self.lang_map = {
            'en': 'en',    # English
            'jp': 'jp',    # Japanese
            'cht': 'cht',  # Traditional Chinese
        }

    def _get_sign(self, text, salt):
        """Return the request signature: MD5 hex of appid + text + salt + key."""
        sign_str = f"{self.appid}{text}{salt}{self.secret_key}"
        return hashlib.md5(sign_str.encode("utf-8")).hexdigest()

    @staticmethod
    def _split_text(text, max_length, break_chars="\n。!?;!?;"):
        """Cut ``text`` into consecutive chunks of at most ``max_length`` chars.

        Each cut is placed just after the last character of ``break_chars``
        found inside the current window; when the window holds none, the cut
        is made at exactly ``max_length``. Concatenating the returned chunks
        always reproduces ``text``.

        Raises:
            ValueError: If ``max_length`` is smaller than 1.
        """
        if max_length < 1:
            raise ValueError("max_length must be a positive integer")
        chunks = []
        start = 0
        while len(text) - start > max_length:
            window_end = start + max_length
            cut = max(
                (text.rfind(ch, start, window_end) for ch in break_chars),
                default=-1,
            )
            # rfind returns -1 when the character is absent from the window.
            end = cut + 1 if cut >= start else window_end
            chunks.append(text[start:end])
            start = end
        chunks.append(text[start:])
        return chunks

    @staticmethod
    def _extract_translation(result):
        """Return the translated text contained in a decoded API response.

        All entries of ``trans_result`` are joined with newlines, so input
        that the service answered with several entries is not truncated to
        its first entry.

        Raises:
            RuntimeError: If the response carries an ``error_code``.
        """
        if 'error_code' in result:
            detail = result.get('error_msg', result['error_code'])
            raise RuntimeError(f"Translation error: {detail}")
        return "\n".join(item['dst'] for item in result['trans_result'])

    @staticmethod
    def _build_output_path(input_path):
        """Return ``input_path`` with ``_translated`` inserted before the extension.

        Only the final extension is touched, so directory names containing
        ``.docx`` are preserved and the result never equals the input path.
        A path without an extension gets ``.docx`` appended.
        """
        root, ext = os.path.splitext(input_path)
        return f"{root}_translated{ext or '.docx'}"

    def translate_text(self, text, to_lang):
        """Translate ``text`` into ``to_lang``, chunking it when it is too long.

        Args:
            text: Source text.
            to_lang: Target language; looked up in ``lang_map`` and passed
                through unchanged when it is not a key there.

        Returns:
            The translated text.
        """
        target_lang = self.lang_map.get(to_lang, to_lang)
        if len(text) <= self.max_length:
            return self._translate_segment(text, target_lang)

        translated = []
        for chunk in self._split_text(text, self.max_length):
            # Translate only the visible part and re-attach the trailing
            # whitespace locally, so line breaks at chunk boundaries survive.
            body = chunk.rstrip()
            translated.append(
                self._translate_segment(body, target_lang) + chunk[len(body):]
            )
        return "".join(translated)

    def _translate_segment(self, text, to_lang):
        """Translate one chunk with a single API call, retrying on failure.

        Blank text is returned unchanged without contacting the service.
        Up to three attempts are made, one second apart.

        Raises:
            Exception: Whatever the last attempt raised (network error,
                invalid JSON, or ``RuntimeError`` for an API error response).
        """
        if not text.strip():
            return text
        salt = str(int(time.time()))
        params = {
            'q': text,
            'from': 'zh',
            'to': to_lang,
            'appid': self.appid,
            'salt': salt,
            'sign': self._get_sign(text, salt),
        }
        max_retries = 3
        for attempt in range(max_retries):
            try:
                # Form-encoded POST keeps long queries out of the URL; the
                # timeout stops a stalled connection from blocking forever.
                # NOTE(review): confirm the endpoint accepts POST.
                response = requests.post(
                    self.api_url, data=params, timeout=self.timeout
                )
                return self._extract_translation(response.json())
            except Exception:
                if attempt == max_retries - 1:
                    raise
                time.sleep(1)

    def _translate_paragraph(self, paragraph, target_lang):
        """Translate every run of ``paragraph`` that contains text, in place.

        All translations are fetched before the paragraph is modified, so a
        failure part-way through leaves the paragraph exactly as it was.
        """
        pending = []
        for run in paragraph.runs:
            original = run.text
            body = original.rstrip('\n\r\v')
            if not body.strip():
                # Empty or whitespace-only run: nothing to translate.
                continue
            trailing_breaks = len(original) - len(body)
            pending.append(
                (run, self.translate_text(body, target_lang), trailing_breaks)
            )

        for run, translated, trailing_breaks in pending:
            # Assigning run.text replaces the run's content but keeps its
            # formatting properties.
            run.text = translated
            for _ in range(trailing_breaks):
                run.add_break()

    def _translate_paragraph_safely(self, paragraph, target_lang, label):
        """Translate ``paragraph``; on error report it and keep the original."""
        try:
            self._translate_paragraph(paragraph, target_lang)
        except Exception as e:
            print(f"Error translating {label}: {e}")

    @staticmethod
    def _report_progress(progress_callback, done, total):
        """Call ``progress_callback`` with the integer percentage completed."""
        if progress_callback:
            progress_callback(int((done / total) * 100))

    def translate_document(self, input_path, target_lang='en', progress_callback=None):
        """Translate a .docx file and save the result next to the input.

        Args:
            input_path: Path of the document to translate.
            target_lang: Target language code passed to ``translate_text``.
            progress_callback: Optional callable receiving an integer
                percentage after each body paragraph and each table cell.

        Returns:
            The path of the saved, translated document.
        """
        doc = Document(input_path)
        output_path = self._build_output_path(input_path)

        paragraphs = doc.paragraphs
        cells = [
            cell
            for table in doc.tables
            for row in table.rows
            for cell in row.cells
        ]
        total_items = len(paragraphs) + len(cells)
        current_item = 0

        # Body paragraphs.
        for paragraph in paragraphs:
            if paragraph.text.strip():
                self._translate_paragraph_safely(paragraph, target_lang, "paragraph")
            current_item += 1
            self._report_progress(progress_callback, current_item, total_items)

        # Table cells. A merged cell is listed once per grid column it spans;
        # remember the underlying element so it is translated only once.
        seen_cells = set()
        for cell in cells:
            cell_key = getattr(cell, "_tc", cell)
            if cell_key not in seen_cells:
                seen_cells.add(cell_key)
                if cell.text.strip():
                    for paragraph in cell.paragraphs:
                        self._translate_paragraph_safely(paragraph, target_lang, "cell")
            current_item += 1
            self._report_progress(progress_callback, current_item, total_items)

        doc.save(output_path)
        return output_path