DNA_bert_6 / README.md
moeinh77's picture
the calling card for how to use dna_bert with huggingface API
038001b
|
raw
history blame
No virus
1.17 kB
metadata
tags:
  - dna_bert
# Number of target classes in your dataset. The value 2 is only an example
# placeholder -- replace it with the real class count before loading the model.
NUM_CLASSES = 2

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
# Hugging Face Hub ID of the pretrained checkpoint. It has to be a string:
# left unquoted, Python parses it as the division `zhihan1996 / DNA_bert_6`
# and raises NameError.
MODEL_ID = "zhihan1996/DNA_bert_6"

# do_lower_case=False stops the tokenizer from lower-casing the input text.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, do_lower_case=False)

# Load the checkpoint for sequence classification with one output label per
# class in the dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID, num_labels=NUM_CLASSES
)

def return_kmer(seq, K=6):
    """
    Split a sequence into its overlapping K-mers.

    A window of width ``K`` is slid over ``seq`` one position at a time and
    every window is collected, in order, into a single space-separated string.

    Parameters
    ----------
    seq : str
        The sequence to break into K-mers.
    K : int, optional
        Width of each K-mer window, by default 6.

    Returns
    -------
    kmer_seq : str
        The K-mers joined by single spaces. The result is an empty string
        when ``seq`` is shorter than ``K``.
    """
    # Number of window start positions; zero or negative yields no windows.
    window_count = len(seq) - K + 1
    windows = (seq[start : start + K] for start in range(window_count))
    return " ".join(windows)

# Example placeholder data -- replace with your own list of DNA sequence
# strings (one string per sample).
sequence = ["ATGCGTACGTTAGC", "GGCATCGATTACGA"]

# Turn every raw sequence into a space-separated string of overlapping K-mers
# (return_kmer's default K of 6 is used).
train_kmers = [return_kmer(seq) for seq in sequence]

# Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which accepts the same batch input and keyword arguments.
train_encodings = tokenizer(
    train_kmers,
    max_length=512,  # max len of BERT
    padding=True,  # pad each sample to the longest one in the batch
    truncation=True,  # cut samples that exceed max_length
    return_attention_mask=True,
    return_tensors="pt",  # return PyTorch tensors
)