File size: 1,166 Bytes
038001b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
---
tags:
- dna_bert
---
```python
# Placeholder: set this to the number of classes in your data.
NUM_CLASSES = 2

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

# The checkpoint id must be passed as a string. do_lower_case=False asks the
# tokenizer not to lowercase the input text.
tokenizer = AutoTokenizer.from_pretrained(
    "zhihan1996/DNA_bert_6", do_lower_case=False
)

# Sequence-classification model whose output size is NUM_CLASSES labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "zhihan1996/DNA_bert_6", num_labels=NUM_CLASSES
)
def return_kmer(seq, K=6):
    """
    Split a sequence into overlapping K-mers.

    A window of length K slides over the sequence one character at a time,
    so a sequence of length n yields n - K + 1 K-mers.

    Parameters
    ----------
    seq : str
        A single sequence to be split into K-mers
    K : int, optional
        The length of the K-mers, by default 6

    Returns
    -------
    kmer_seq : str
        A string of K-mers separated by spaces; empty when the sequence
        is shorter than K

    Raises
    ------
    ValueError
        If K is smaller than 1
    """
    # A non-positive K would silently yield empty or skewed slices.
    if K < 1:
        raise ValueError(f"K must be a positive integer, got {K}")
    # range() is empty when len(seq) < K, which produces the empty string.
    return " ".join(seq[start : start + K] for start in range(len(seq) - K + 1))
# Placeholder: replace with your own list of DNA sequence strings.
sequence = ["ATGCATGCATGC", "GGCCTTAAGGCC"]

# One space-separated K-mer string per input sequence.
train_kmers = [return_kmer(seq) for seq in sequence]

# Calling the tokenizer directly is the supported way to encode a batch;
# the keyword arguments are unchanged from the batch_encode_plus call.
train_encodings = tokenizer(
    train_kmers,
    max_length=512,  # max len of BERT
    padding=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)
```