Project video walkthrough:
Machine translation based on LSTM + Transformer - Chinese-Tibetan translation _ bilibili
Data preview:
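The raw corpus used below, 测试集.txt, is a plain-text file with one sentence pair per line, source and target joined by a '%%' delimiter. This layout is inferred from the parsing code that follows (note the loader drops the last two characters of each target, presumably trailing markers in the raw file); a hypothetical line:

<Chinese source sentence>%%<Tibetan target sentence>..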
# coding:utf-8
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Sample data (duplicated to increase the sample size)
# data = [ ... ]  # original inline sample data, commented out
# Load parallel sentence pairs: each line holds source and target separated by '%%'
data = []
with open('测试集.txt', 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()
for line in lines:
    parts = line.split('%%')
    # parts[0]: source sentence; parts[1]: target sentence
    # the last two characters of the target are dropped (trailing markers in the raw file)
    data.append([parts[0], parts[1][:-2]])
print(data)
source_texts = [pair[0] for pair in data]
target_texts = [pair[1] for pair in data]
# Train/test split
source_train, source_test, target_train, target_test = train_test_split(source_texts, target_texts, test_size=0.2, random_state=42)
# Tokenize the source language
source_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts(source_train)
source_train_seq = source_tokenizer.texts_to_sequences(source_train)
source_test_seq = source_tokenizer.texts_to_sequences(source_test)
# Tokenize the target language
target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(target_train)
target_train_seq = target_tokenizer.texts_to_sequences(target_train)
target_test_seq = target_tokenizer.texts_to_sequences(target_test)
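The fitted tokenizers also determine the vocabulary sizes needed later when sizing the Embedding and output layers. A small follow-up using the standard Keras Tokenizer API (the +1 reserves index 0, which pad_sequences uses for padding):

source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
print('source_vocab_size', source_vocab_size)
print('target_vocab_size', target_vocab_size)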
# Pad sequences (max lengths come from the training set; longer test sequences are truncated by pad_sequences)
max_source_length = max(len(seq) for seq in source_train_seq)
max_target_length = max(len(seq) for seq in target_train_seq)
source_train_seq = pad_sequences(source_train_seq, maxlen=max_source_length, padding='post')
print('source_train_seq.shape',source_train_seq.shape)
source_test_seq = pad_sequences(source_test_seq, maxlen=max_source_length, padding='post')
print('source_test_seq.shape', source_test_seq.shape)
target_train_seq = pad_sequences(target_train_seq, maxlen=max_target_length, padding='post')
target_test_seq = pad_sequences(target_test_seq, maxlen=max_target_length, padding='post')
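If training later uses a TimeDistributed softmax with sparse_categorical_crossentropy (an assumption; the training code is not shown in this excerpt), the padded targets can be given a trailing label axis:

target_train_labels = np.expand_dims(target_train_seq, -1)  # (num_samples, max_target_length, 1)
target_test_labels = np.expand_dims(target_test_seq, -1)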
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Bidirectional
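The excerpt ends at the imports. For orientation only, here is a minimal encoder-decoder sketch built from these layers; the layer widths (128/256), the embedding size, and the teacher-forcing setup are all assumptions, and the Transformer half of the original model is not reproduced here:

from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Embedding, LSTM, Dense,
                                     TimeDistributed, Bidirectional, Concatenate)

# Encoder: bidirectional LSTM; the concatenated final states initialise the decoder
encoder_inputs = Input(shape=(max_source_length,))
enc_emb = Embedding(source_vocab_size, 128, mask_zero=True)(encoder_inputs)
_, fh, fc, bh, bc = Bidirectional(LSTM(128, return_state=True))(enc_emb)
state_h = Concatenate()([fh, bh])  # (batch, 256)
state_c = Concatenate()([fc, bc])

# Decoder: in a real setup its input would be the target sequence shifted
# right behind a start token (teacher forcing); that preprocessing is not
# shown in the original excerpt
decoder_inputs = Input(shape=(max_target_length,))
dec_emb = Embedding(target_vocab_size, 128, mask_zero=True)(decoder_inputs)
dec_out = LSTM(256, return_sequences=True)(dec_emb, initial_state=[state_h, state_c])
outputs = TimeDistributed(Dense(target_vocab_size, activation='softmax'))(dec_out)

model = Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()
# Hypothetical training call, with decoder_input_seq being the shifted targets:
# model.fit([source_train_seq, decoder_input_seq], target_train_labels, epochs=10)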