def __getitem__(self, idx):
    """Return the encoded sequences and the regression target for one row.

    Args:
        idx: Positional row index, looked up via ``self.df.iloc``.

    Returns:
        A ``(seqs, target)`` tuple. ``seqs`` is a list holding one
        ``self.tokenize_and_encode`` result per column named in
        ``self.columns``; ``target`` is a float tensor built from the
        row's ``'mRNA_remaining_pct'`` value.
    """
    row = self.df.iloc[idx]  # fetch the idx-th row
    seqs = [self.tokenize_and_encode(row[col]) for col in self.columns]
    # NOTE(review): the original comment said the target is only built when
    # not in test-set mode, but no mode check is visible in this method.
    # If a test-mode flag exists on the class, a branch returning only
    # ``seqs`` is probably missing here -- confirm against the constructor.
    target = torch.tensor(row['mRNA_remaining_pct'], dtype=torch.float)
    # Previously the method fell off the end and implicitly returned None,
    # discarding both values it had just computed.
    return seqs, target
def tokenize_and_encode(self, seq):
    """Tokenize ``seq`` and encode it as a fixed-length index tensor.

    Args:
        seq: The sequence to encode; it is passed unchanged to
            ``self.tokenizer.tokenize``.

    Returns:
        A ``torch.long`` tensor of exactly ``self.max_len`` indices.
        Shorter inputs are right-padded with 0 and longer inputs are
        truncated to the first ``self.max_len`` tokens.
    """
    # NOTE(review): the original also computed ``seq.split()`` (whitespace
    # split) and immediately overwrote it with the tokenizer output, so the
    # split never affected the result and has been removed. If
    # whitespace-separated inputs were meant to bypass the tokenizer, a
    # conditional is missing here -- confirm the intended behaviour.
    tokens = self.tokenizer.tokenize(seq)
    # Map tokens to vocabulary indices; unknown tokens fall back to 0,
    # the same index used for padding (<pad>).
    stoi = self.vocab.stoi
    encoded = [stoi.get(token, 0) for token in tokens]
    # Pad first, then slice: the slice is a no-op for short inputs and
    # performs the truncation for long ones.
    padded = encoded + [0] * (self.max_len - len(encoded))
    return torch.tensor(padded[:self.max_len], dtype=torch.long)