# Listing metadata: 244 lines, 12 KiB, Python
import hashlib
import os
import time

import requests
from docx import Document


class DocxTranslator:
    """Translate the text of a .docx file through the Baidu Translate HTTP API.

    The source language is always sent as ``zh``. Paragraphs and table cells
    are translated run by run so that basic run formatting (bold, italic,
    underline, font name, size and colour) can be re-applied to the
    translated text.
    """

    # Delimiter used to cut over-long text at sentence boundaries.
    _SENTENCE_END = '。'
    # Trailing characters of a run's text that are treated as a line break.
    _BREAK_CHARS = '\n\r\v'
    # Number of attempts made for one API call before giving up.
    _MAX_RETRIES = 3
    # Seconds to wait for the API before an attempt is considered failed.
    _REQUEST_TIMEOUT = 10

    def __init__(self, appid=None, secret_key=None):
        """Set up API credentials and translation limits.

        Args:
            appid: Baidu application id. Falls back to the
                ``BAIDU_TRANSLATE_APPID`` environment variable, then to the
                built-in default.
            secret_key: Baidu secret key. Falls back to the
                ``BAIDU_TRANSLATE_SECRET_KEY`` environment variable, then to
                the built-in default.
        """
        # NOTE(review): the fallback credentials below are committed to source
        # control. They are kept only so existing callers keep working; they
        # should be rotated and supplied via arguments or the environment.
        self.appid = (
            appid
            or os.environ.get('BAIDU_TRANSLATE_APPID')
            or "20241112002200806"
        )
        self.secret_key = (
            secret_key
            or os.environ.get('BAIDU_TRANSLATE_SECRET_KEY')
            or "preM0becByYCdotRTP_a"
        )
        self.api_url = "https://api.fanyi.baidu.com/api/trans/vip/translate"
        # Maximum number of characters sent in a single API request.
        self.max_length = 2000
        # Caller-facing language code -> code sent to the API.
        self.lang_map = {
            'en': 'en',    # English
            'jp': 'jp',    # Japanese
            'cht': 'cht',  # Traditional Chinese
        }

    def _get_sign(self, text, salt):
        """Return the request signature: MD5 hex of appid + text + salt + key."""
        sign_str = f"{self.appid}{text}{salt}{self.secret_key}"
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def _split_text(self, text):
        """Cut *text* into chunks of at most ``max_length`` characters.

        Chunks end on a sentence delimiter whenever possible; a single
        sentence longer than the limit is cut at the limit. Concatenating the
        returned chunks always reproduces *text* exactly.
        """
        limit = max(1, self.max_length)
        parts = text.split(self._SENTENCE_END)
        # Every part except the last was followed by a delimiter in the
        # original text; the last part is the (possibly empty) tail.
        sentences = [part + self._SENTENCE_END for part in parts[:-1]]
        if parts[-1]:
            sentences.append(parts[-1])

        chunks = []
        current = ''
        for sentence in sentences:
            if current and len(current) + len(sentence) > limit:
                chunks.append(current)
                current = ''
            # Only reachable with an empty ``current``: the sentence alone
            # does not fit into one request, so cut it at the limit.
            while len(sentence) > limit:
                chunks.append(sentence[:limit])
                sentence = sentence[limit:]
            current += sentence
        if current:
            chunks.append(current)
        return chunks

    def translate_text(self, text, to_lang):
        """Translate *text* into *to_lang*, splitting it when it is too long.

        Args:
            text: Text to translate.
            to_lang: Target language; looked up in ``lang_map`` and passed
                through unchanged when it is not a known key.

        Returns:
            The translated text. For text longer than ``max_length`` the
            translations of the individual chunks are concatenated.

        Raises:
            Exception: Whatever ``_translate_segment`` raises once its
                retries are exhausted.
        """
        target_lang = self.lang_map.get(to_lang, to_lang)
        if len(text) <= self.max_length:
            return self._translate_segment(text, target_lang)
        return ''.join(
            self._translate_segment(chunk, target_lang)
            for chunk in self._split_text(text)
        )

    def _translate_segment(self, text, to_lang):
        """Translate one chunk of text with a single API request (with retries).

        Blank text is returned unchanged without contacting the API.

        Raises:
            Exception: When the API reports an error or the request fails on
                the final attempt.
        """
        if not text.strip():
            return text

        salt = str(int(time.time()))
        params = {
            'q': text,
            'from': 'zh',
            'to': to_lang,
            'appid': self.appid,
            'salt': salt,
            'sign': self._get_sign(text, salt),
        }

        for attempt in range(self._MAX_RETRIES):
            try:
                # POST keeps long queries out of the URL; the timeout stops a
                # stalled connection from hanging the whole document.
                response = requests.post(
                    self.api_url, data=params, timeout=self._REQUEST_TIMEOUT
                )
                result = response.json()
                if 'error_code' in result:
                    detail = result.get('error_msg', result['error_code'])
                    raise Exception(f"Translation error: {detail}")
                # The API answers with one entry per line of the query, so
                # every entry is needed to rebuild multi-line text.
                return '\n'.join(item['dst'] for item in result['trans_result'])
            except Exception:
                if attempt == self._MAX_RETRIES - 1:
                    raise
                time.sleep(1)

    @staticmethod
    def _build_output_path(input_path):
        """Return *input_path* with ``_translated`` inserted before the extension."""
        root, ext = os.path.splitext(input_path)
        return f"{root}_translated{ext or '.docx'}"

    def _translate_paragraph(self, paragraph, target_lang):
        """Translate one paragraph run by run, re-applying run formatting.

        Every run is translated before the paragraph is modified, so a
        failed translation propagates with the paragraph left untouched.
        """
        rebuilt = []
        for run in paragraph.runs:
            text = run.text
            if not text:
                continue
            # Trailing break characters are not sent to the API; a single
            # break is re-added after the translated text instead.
            body = text.rstrip(self._BREAK_CHARS)
            if body.strip():
                translated = self.translate_text(body, target_lang)
            else:
                translated = body
            rebuilt.append({
                'text': translated,
                'bold': run.bold,
                'italic': run.italic,
                'underline': run.underline,
                'font_name': run.font.name,
                'font_size': run.font.size,
                'color_rgb': run.font.color.rgb if run.font.color else None,
                'has_break': len(body) != len(text),
            })

        if not rebuilt:
            # No run carries text: clearing would only destroy content.
            return

        # Drop the old content but keep the paragraph-level formatting.
        paragraph.clear()
        for spec in rebuilt:
            new_run = paragraph.add_run()
            new_run.bold = spec['bold']
            new_run.italic = spec['italic']
            new_run.underline = spec['underline']
            new_run.font.name = spec['font_name']
            if spec['font_size'] is not None:
                new_run.font.size = spec['font_size']
            if spec['color_rgb'] is not None:
                new_run.font.color.rgb = spec['color_rgb']
            new_run.text = spec['text']
            if spec['has_break']:
                new_run.add_break()  # default break type

    def translate_document(self, input_path, target_lang='en', progress_callback=None):
        """Translate a .docx file and save the result next to the original.

        Args:
            input_path: Path of the document to translate.
            target_lang: Target language code (see ``lang_map``).
            progress_callback: Optional callable that receives the progress
                as an integer percentage after each paragraph and table cell.

        Returns:
            Path of the saved translated document.

        A paragraph or cell that fails to translate is reported with
        ``print`` and keeps its original content; the rest of the document is
        still processed.
        """
        doc = Document(input_path)
        output_path = self._build_output_path(input_path)

        # Total number of items, used only for progress reporting.
        total_items = len(doc.paragraphs)
        for table in doc.tables:
            total_items += sum(len(row.cells) for row in table.rows)
        current_item = 0

        # Body paragraphs.
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                try:
                    self._translate_paragraph(paragraph, target_lang)
                except Exception as e:
                    print(f"Error translating paragraph: {e}")

            current_item += 1
            if progress_callback:
                progress_callback(int((current_item / total_items) * 100))

        # Table cells, handled with the same per-paragraph logic.
        for table in doc.tables:
            # A merged cell is listed once per grid position it covers; track
            # the underlying elements so each is translated only once.
            seen_cells = set()
            for row in table.rows:
                for cell in row.cells:
                    cell_element = cell._tc
                    if cell_element not in seen_cells and cell.text.strip():
                        try:
                            for paragraph in cell.paragraphs:
                                self._translate_paragraph(paragraph, target_lang)
                        except Exception as e:
                            print(f"Error translating cell: {e}")
                    seen_cells.add(cell_element)

                    current_item += 1
                    if progress_callback:
                        progress_callback(int((current_item / total_items) * 100))

        doc.save(output_path)
        return output_path