from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Lock for thread-safe access to the shared visited set
visited_lock = threading.Lock()

# Set of URLs that have already been discovered (shared across threads)
visited = set()


# Scrape all same-domain links from a single page, respecting the depth limit
def get_all_links(url, max_depth, current_depth=0):
    if current_depth > max_depth:
        return []

    driver = None
    try:
        print(f"Scraping: {url} at depth {current_depth}")

        # Set up Chrome options to run in headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")

        # Set up the Chrome driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Navigate to the URL
        driver.get(url)

        # Wait for the page to load (adjust the sleep time if needed)
        time.sleep(5)

        # Get the rendered page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all 'a' tags and extract the 'href' attribute
        links = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            full_url = urljoin(url, href)

            # Only include links from the same domain that have not been visited yet
            with visited_lock:
                if urlparse(full_url).netloc == urlparse(url).netloc and full_url not in visited:
                    visited.add(full_url)
                    links.add(full_url)

        return list(links)

    except Exception as e:
        print(f"Error fetching the URL: {e}")
        return []

    finally:
        # Always close the browser, even if an error occurred
        if driver is not None:
            driver.quit()


def scrape_recursive(urls, max_depth, current_depth, executor):
    if current_depth > max_depth:
        return set()

    # Submit a scraping task for each URL to the ThreadPoolExecutor
    futures = [executor.submit(get_all_links, url, max_depth, current_depth) for url in urls]

    all_links = set()
    for future in as_completed(futures):
        try:
            links = future.result()
            all_links.update(links)
        except Exception as e:
            print(f"Error in thread: {e}")

    # Recursively scrape the newly discovered links at the next depth level
    if current_depth + 1 <= max_depth:
        new_links = scrape_recursive(all_links, max_depth, current_depth + 1, executor)
        all_links.update(new_links)

    return all_links


def main():
    # Get the start URL and maximum crawl depth from the user
    input_url = input("Enter the URL to scrape: ")
    max_depth = int(input("Enter the maximum depth: "))

    # Mark the start URL as visited so it is not re-scraped if a page links back to it
    visited.add(input_url)

    # ThreadPoolExecutor for multithreaded scraping
    with ThreadPoolExecutor(max_workers=10) as executor:
        all_links = scrape_recursive([input_url], max_depth, 0, executor)

    # Save the results to links.txt
    with open("links.txt", "w") as file:
        for link in all_links:
            file.write(f"{link}\n")

    print(f"\nFound {len(all_links)} links. Saved to links.txt.")


if __name__ == "__main__":
    main()