|
# Chinese numerals for 1..10 and the Roman numerals they pair with.
_ZH_NUMERALS = "一二三四五六七八九十"
_ROMAN_NUMERALS = ("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X")

# Maps the heading prefix used in the Chinese text to the prefix used for the
# same heading in the English text.  Three numbered levels are covered,
# followed by two unnumbered headings.  Insertion order is kept level by level
# because lookups scan the mapping in order and return the first match.
prefix_map = {
    # Level 1: "一、" -> "I."
    **{
        f"{numeral}、": f"{roman}."
        for numeral, roman in zip(_ZH_NUMERALS, _ROMAN_NUMERALS)
    },
    # Level 2: "(一)" -> "1."
    **{
        f"({numeral})": f"{number}."
        for number, numeral in enumerate(_ZH_NUMERALS, start=1)
    },
    # Level 3: "1." -> "(1)"
    **{f"{number}.": f"({number})" for number in range(1, 11)},
    # Unnumbered headings.
    "结束语": "Conclusion",
    "前言": "Preface",
}
-
def _read_nonblank_lines(path):
    """Return the whitespace-stripped, non-empty lines of the UTF-8 file *path*."""
    with open(path, encoding="utf-8") as handle:
        return [stripped for stripped in map(str.strip, handle) if stripped]


# Chinese text and its English counterpart, one stripped line per list entry.
zh = _read_nonblank_lines("raw.zh")
en = _read_nonblank_lines("raw.en")
-
-
def get_title(text, lang):
    """Identify the heading prefix that *text* starts with.

    The result is always the Chinese-side key of ``prefix_map``, so a Chinese
    line and an English line carrying the same heading yield the same value.

    Args:
        text: A single line of text.
        lang: ``"zh"`` to match against the keys of ``prefix_map``, ``"en"``
            to match against its values.

    Returns:
        The first key of ``prefix_map`` (in insertion order) whose prefix for
        the given language starts *text*, or ``None`` if nothing matches or
        *lang* is neither ``"zh"`` nor ``"en"``.
    """
    if lang == "zh":
        candidates = (
            zh_prefix for zh_prefix in prefix_map if text.startswith(zh_prefix)
        )
    elif lang == "en":
        candidates = (
            zh_prefix
            for zh_prefix, en_prefix in prefix_map.items()
            if text.startswith(en_prefix)
        )
    else:
        return None
    return next(candidates, None)
-
-
# Markers wrapped around any span that could not be aligned automatically.
_MISMATCH_START = '-' * 50
_MISMATCH_END = '*' * 50


def _flagged(lines):
    """Return *lines* wrapped in the mismatch start/end marker lines."""
    return [_MISMATCH_START] + list(lines) + [_MISMATCH_END]


def _section_end(lines, start, lang):
    """Return the index just past the section whose heading is ``lines[start]``.

    A section runs until the next line that ``get_title`` recognises as a
    heading for *lang*, or to the end of *lines*.
    """
    end = start + 1
    while end < len(lines) and not get_title(lines[end], lang):
        end += 1
    return end


def _align_sections(zh_lines, en_lines):
    """Pair up heading-delimited sections of the two texts.

    Both lists are walked section by section.  A section pair is copied
    through unchanged when both sides have the same number of lines, and is
    wrapped in mismatch markers on both sides when the line counts differ.

    Walking stops as soon as the current Chinese line is not a heading, the
    two current headings differ, or either side runs out of lines.  Whatever
    is left on either side at that point is emitted as one final flagged
    span, *including its last line*, so no input line is silently dropped.

    Returns:
        A ``(zh_out, en_out)`` tuple of output line lists.
    """
    zh_out = []
    en_out = []
    pos_zh = 0
    pos_en = 0

    # Requiring both cursors to be in range avoids indexing past the end of
    # the English list when it is shorter than the Chinese one.
    while pos_zh < len(zh_lines) and pos_en < len(en_lines):
        title = get_title(zh_lines[pos_zh], "zh")
        if not title or title != get_title(en_lines[pos_en], "en"):
            break

        end_zh = _section_end(zh_lines, pos_zh, "zh")
        end_en = _section_end(en_lines, pos_en, "en")
        section_zh = zh_lines[pos_zh:end_zh]
        section_en = en_lines[pos_en:end_en]

        if len(section_zh) == len(section_en):
            zh_out.extend(section_zh)
            en_out.extend(section_en)
        else:
            zh_out.extend(_flagged(section_zh))
            en_out.extend(_flagged(section_en))

        pos_zh = end_zh
        pos_en = end_en

    # Anything not consumed above could not be aligned; flag the whole
    # remainder on both sides so the outputs stay in step for manual review.
    if pos_zh < len(zh_lines) or pos_en < len(en_lines):
        zh_out.extend(_flagged(zh_lines[pos_zh:]))
        en_out.extend(_flagged(en_lines[pos_en:]))

    return zh_out, en_out


res_zh, res_en = _align_sections(zh, en)
-
# Write each result list to its own UTF-8 file, every entry followed by an
# empty line.
# NOTE(review): the original indentation was not recoverable; this assumes
# both newline writes belong to the per-line loop (double-spaced output) --
# confirm against the intended file format.
for out_path, out_lines in (("preprocess.zh", res_zh), ("preprocess.en", res_en)):
    with open(out_path, 'w', encoding="utf-8") as out_file:
        for entry in out_lines:
            out_file.write(entry + '\n' + '\n')
|