# Imports
import random
import time
from datetime import datetime

import numpy as np
import plotly.graph_objects as go
import streamlit as st
import torch
from datasets import Dataset
from huggingface_hub import login
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)


# Cyberpunk and Loading Animation Styling
def setup_cyberpunk_style():
    # The CSS payload was empty in the original; inject custom page styling here.
    st.markdown("""
        <style>
        /* cyberpunk styling goes here */
        </style>
    """, unsafe_allow_html=True)


# Prepare Dataset Function with Padding Token Fix
def prepare_dataset(data, tokenizer, block_size=128):
    # GPT-2 ships without a pad token; reuse EOS so padded batches work.
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True,
                         max_length=block_size, padding='max_length')

    raw_dataset = Dataset.from_dict({'text': data})
    tokenized_dataset = raw_dataset.map(tokenize_function, batched=True,
                                        remove_columns=['text'])
    # For causal LM training, the labels are the input ids themselves.
    tokenized_dataset = tokenized_dataset.map(
        lambda examples: {'labels': examples['input_ids']}, batched=True)
    tokenized_dataset.set_format(type='torch',
                                 columns=['input_ids', 'attention_mask', 'labels'])
    return tokenized_dataset


# Training Dashboard Class with Enhanced Display
class TrainingDashboard:
    def __init__(self):
        self.metrics = {
            'current_loss': 0,
            'best_loss': float('inf'),
            'generation': 0,
            'individual': 0,
            'start_time': time.time(),
            'training_speed': 0,
        }
        self.history = []

    def update(self, loss, generation, individual):
        self.metrics['current_loss'] = loss
        self.metrics['generation'] = generation
        self.metrics['individual'] = individual
        if loss < self.metrics['best_loss']:
            self.metrics['best_loss'] = loss
        elapsed_time = time.time() - self.metrics['start_time']
        self.metrics['training_speed'] = (generation * individual) / elapsed_time
        self.history.append({'loss': loss,
                             'timestamp': datetime.now().strftime('%H:%M:%S')})


# Define Model Initialization
def initialize_model(model_name="gpt2"):
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer


# Load Dataset Function (demo text or an uploaded .txt/.csv file).
# The original defined two conflicting versions, one of them nested inside
# main() after its own call site; this merges them into a single definition.
def load_dataset(data_source="demo", tokenizer=None, uploaded_file=None):
    if data_source == "demo":
        data = ["Sample text data for model training. This can be replaced "
                "with actual data for better performance."]
    elif uploaded_file is not None:
        if uploaded_file.name.endswith(".txt"):
            data = [uploaded_file.read().decode("utf-8")]
        elif uploaded_file.name.endswith(".csv"):
            import pandas as pd
            df = pd.read_csv(uploaded_file)
            data = df[df.columns[0]].astype(str).tolist()  # assume the first column is text
        else:
            data = ["Unsupported file type. Please upload a .txt or .csv file."]
    else:
        data = ["No file uploaded. Please upload a dataset."]
    return prepare_dataset(data, tokenizer)


# Train Model Function; learning rate, warmup steps, and weight decay are now
# threaded through from the sidebar instead of being collected and ignored.
def train_model(model, train_dataset, tokenizer, epochs=3, batch_size=4,
                learning_rate=3e-5, warmup_steps=100, weight_decay=0.01):
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=100,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()
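# TrainingDashboard is defined above but never wired into training. A minimal
# sketch (an assumption, not part of the original app) of how it could be
# connected using transformers' TrainerCallback: pass an instance via
# Trainer(..., callbacks=[DashboardCallback(dashboard)]).
from transformers import TrainerCallback


class DashboardCallback(TrainerCallback):
    """Forward Trainer loss logs to a TrainingDashboard instance (sketch)."""

    def __init__(self, dashboard, generation=0, individual=0):
        self.dashboard = dashboard
        # generation/individual are placeholders here; the dashboard API
        # expects them even though this app trains a single model.
        self.generation = generation
        self.individual = individual

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Trainer emits a dict like {"loss": 2.31, ...} every logging_steps.
        if logs and "loss" in logs:
            self.dashboard.update(logs["loss"], self.generation, self.individual)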

# Main App Logic
def main():
    setup_cyberpunk_style()
    # The original heading markup was lost in extraction; render the same
    # title with minimal HTML.
    st.markdown('<h1>Cyberpunk Neural Training Hub</h1>', unsafe_allow_html=True)

    # Sidebar Configuration with Additional Options
    with st.sidebar:
        st.markdown("### Configuration Panel")

        # Hugging Face API Token Input.
        # HfApi.set_access_token was removed from huggingface_hub; login()
        # is the supported way to register a token.
        hf_token = st.text_input("Enter your Hugging Face Token", type="password")
        if hf_token:
            login(token=hf_token)
            st.success("Hugging Face token added successfully!")

        # Training Parameters
        training_epochs = st.slider("Training Epochs", min_value=1, max_value=5, value=3)
        batch_size = st.slider("Batch Size", min_value=2, max_value=8, value=4)
        model_choice = st.selectbox("Model Selection", ("gpt2", "distilgpt2", "gpt2-medium"))

        # Dataset Source Selection
        data_source = st.selectbox("Data Source", ("demo", "uploaded file"))
        uploaded_file = None  # default so the name is bound in demo mode
        if data_source == "uploaded file":
            uploaded_file = st.file_uploader("Upload a text file", type=["txt", "csv"])

        custom_learning_rate = st.slider("Learning Rate", min_value=1e-6,
                                         max_value=5e-4, value=3e-5, step=1e-6)

        # Advanced Settings Toggle
        advanced_toggle = st.checkbox("Advanced Training Settings")
        if advanced_toggle:
            warmup_steps = st.slider("Warmup Steps", min_value=0, max_value=500, value=100)
            weight_decay = st.slider("Weight Decay", min_value=0.0, max_value=0.1,
                                     step=0.01, value=0.01)
        else:
            warmup_steps = 100
            weight_decay = 0.01

    # Initialize model and tokenizer for the selected architecture (the
    # original initialized "gpt2" before model_choice had been read).
    model, tokenizer = initialize_model(model_choice)

    # Load Dataset
    train_dataset = load_dataset(data_source, tokenizer, uploaded_file=uploaded_file)

    # Start Training with Progress Bar
    progress_placeholder = st.empty()
    st.markdown("### Model Training Progress")
    for epoch in range(training_epochs):
        train_model(model, train_dataset, tokenizer, epochs=1,
                    batch_size=batch_size,
                    learning_rate=custom_learning_rate,
                    warmup_steps=warmup_steps,
                    weight_decay=weight_decay)

        # Update Progress Bar. The original HTML body was lost in extraction;
        # this is a minimal inline-styled bar carrying the same progress value.
        progress = (epoch + 1) / training_epochs * 100
        progress_placeholder.markdown(f"""
            <div style="width: 100%; background: #222; border-radius: 4px;">
                <div style="width: {progress:.0f}%; background: #0ff; color: #000;
                            padding: 4px 0; border-radius: 4px; text-align: center;">
                    {progress:.0f}%
                </div>
            </div>
        """, unsafe_allow_html=True)

    st.success("Training Complete!")


if __name__ == "__main__":
    main()