from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Lock guarding access to the shared visited set across threads
visited_lock = threading.Lock()

# Set of URLs already collected (always accessed while holding visited_lock)
visited = set()

# Scrape all same-domain links from a single page, respecting the depth limit
def get_all_links(url, max_depth, current_depth=0):
    if current_depth > max_depth:
        return []

    driver = None
    try:
        # Print the current URL being scraped
        print(f"Scraping: {url} at depth {current_depth}")

        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode

        # Set up the Chrome driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Navigate to the URL
        driver.get(url)

        # Wait for the page to load (adjust the sleep time if needed)
        time.sleep(5)

        # Get the page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all 'a' tags and resolve each 'href' against the current URL
        links = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            full_url = urljoin(url, href)
            # Only keep links from the same domain that have not been visited yet
            with visited_lock:
                if urlparse(full_url).netloc == urlparse(url).netloc and full_url not in visited:
                    visited.add(full_url)
                    links.add(full_url)

        return list(links)
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []
    finally:
        # Always close the browser, even if an error occurred
        if driver is not None:
            driver.quit()

# Crawl a set of URLs in parallel, then recurse into the newly found links
def scrape_recursive(urls, max_depth, current_depth, executor):
    if current_depth > max_depth:
        return set()

    # Submit one scraping task per URL to the ThreadPoolExecutor
    futures = [executor.submit(get_all_links, url, max_depth, current_depth) for url in urls]

    all_links = set()
    for future in as_completed(futures):
        try:
            links = future.result()
            all_links.update(links)
        except Exception as e:
            print(f"Error in thread: {e}")

    # Recursively scrape the new set of links one level deeper
    if current_depth + 1 <= max_depth:
        new_links = scrape_recursive(all_links, max_depth, current_depth + 1, executor)
        all_links.update(new_links)

    return all_links

def main():
    # Get the start URL and crawl depth from the user
    input_url = input("Enter the URL to scrape: ")
    max_depth = int(input("Enter the maximum depth: "))

    # ThreadPoolExecutor for multithreaded scraping
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Start scraping from the input URL at depth 0
        all_links = scrape_recursive([input_url], max_depth, 0, executor)

    # Save the results to links.txt
    with open("links.txt", "w") as file:
        for link in all_links:
            file.write(f"{link}\n")

    print(f"\nFound {len(all_links)} links. Saved to links.txt.")


if __name__ == "__main__":
    main()
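
If you want to reuse the crawler from another script rather than answering the input() prompts, a minimal sketch along these lines should work; it assumes the code above is saved as link_scraper.py (the module name, start URL, and depth below are illustrative assumptions, not part of the original script):

from concurrent.futures import ThreadPoolExecutor

from link_scraper import scrape_recursive  # hypothetical module name for the script above

if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Depth 0 scrapes only the start page; depth 1 also scrapes the pages it links to
        links = scrape_recursive(["https://example.com"], max_depth=1, current_depth=0, executor=executor)
    for link in sorted(links):
        print(link)

Note that each call to get_all_links launches its own headless Chrome instance, so the number of worker threads and the crawl depth directly multiply the number of browser processes started during a run.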