File size: 2,441 Bytes
4b8361a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0030bc6
 
4b8361a
0030bc6
4b8361a
0030bc6
4b8361a
 
 
 
 
 
 
 
0030bc6
 
4b8361a
 
 
0030bc6
4b8361a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import download_song
import time

def set_env():
    """Export the Spotify API credentials as environment variables.

    Reads ``auth/spotify.json`` (resolved relative to this file) and copies
    its ``client_id`` and ``client_secret`` entries into
    ``SPOTIPY_CLIENT_ID`` and ``SPOTIPY_CLIENT_SECRET``.  It also sets
    ``SPOTIPY_REDIRECT_URI`` to a fixed localhost callback URL.

    Raises:
        FileNotFoundError: If the credentials file does not exist.
        KeyError: If the JSON document lacks one of the two expected keys.
    """
    credentials_path = os.path.join(
        os.path.dirname(__file__), "auth", "spotify.json"
    )
    with open(credentials_path, "r") as handle:
        credentials = json.load(handle)

    # (environment variable, key in the JSON file), exported in this order.
    exported = (
        ("SPOTIPY_CLIENT_ID", "client_id"),
        ("SPOTIPY_CLIENT_SECRET", "client_secret"),
    )
    for env_name, json_key in exported:
        os.environ[env_name] = credentials[json_key]
    os.environ["SPOTIPY_REDIRECT_URI"] = "https://localhost:8080/callback"

# Runs at import time: the SPOTIPY_* variables must be in the environment
# before SpotifyClientCredentials() is constructed without arguments below
# (presumably it falls back to these variables -- confirm in spotipy docs).
set_env()


def get_song_preview_url(song_name:str, spotify:spotipy.Spotify, artist:str = None) -> str | None:
    """Look up the preview URL of the best Spotify match for a song.

    Args:
        song_name: Track title to search for.
        spotify: Client whose ``search`` method is used for the lookup.
        artist: Optional artist name; added to the query when not ``None``.

    Returns:
        The ``preview_url`` value of the first search hit, or ``None`` when
        the search yields no items, the first item is ``None``, or it has no
        ``preview_url`` key.  The stored value itself may also be ``None``.
    """
    # Ordered (field, value) pairs; the track always comes first.
    search_fields = [("track", song_name)]
    if artist is not None:
        search_fields.append(("artist", artist))

    # NOTE(review): the query is built as "field: value" with a space after
    # the colon -- confirm this matches Spotify's field-filter syntax.
    query = " ".join(f"{field}: {value}" for field, value in search_fields)

    response = spotify.search(query, type="track", limit=1)
    items = response["tracks"]["items"]
    if len(items) == 0:
        return None

    top_hit = items[0]
    if top_hit is None or "preview_url" not in top_hit:
        return None
    return top_hit["preview_url"]

def patch_missing_songs(
    df: pd.DataFrame,
    after: int = 18418,
) -> pd.DataFrame:
    """Fill in missing ``Sample`` preview URLs by searching Spotify.

    A row counts as missing when its ``Sample`` value is NaN or the
    placeholder string ``"."``.  For each such row, the ``Title`` and
    ``Artist`` columns are used to search for a preview URL; rows with no
    result are left unchanged.

    Args:
        df: Frame with ``Title``, ``Artist`` and ``Sample`` columns.  It is
            updated in place.
        after: Number of leading missing rows to skip, e.g. to resume an
            interrupted run.  The default keeps the offset that used to be
            hard-coded here; pass ``0`` to process every missing row.

    Returns:
        The same ``df`` object, with the found preview URLs written in.
    """
    spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

    # Treat the "." placeholder the same as a real NaN.
    missing_audio = df["Sample"].replace(".", np.nan).isna()
    # Skip the rows that an earlier run already handled.
    missing_df = df[missing_audio].iloc[after:]

    patched_rows = []
    patched_indices = []
    for index, row in tqdm(missing_df.iterrows(), total=len(missing_df)):
        preview_url = get_song_preview_url(row["Title"], spotify, row["Artist"])
        if preview_url is not None:
            # iterrows() yields a fresh Series, so this does not touch df yet.
            row["Sample"] = preview_url
        patched_rows.append(row)
        patched_indices.append(index)

    # Nothing to write back when no rows were processed.
    if patched_rows:
        df.update(pd.DataFrame(patched_rows, index=patched_indices))
    return df


def download_links_from_backup(backup_file:str, output_dir:str):
    """Download every song linked in a comma-separated backup file.

    The link is taken from the second comma-separated field of each line.
    Lines with fewer than two fields (blank lines, for instance) and fields
    that do not contain ``"https"`` are skipped.

    Args:
        backup_file: Path of the text file to read the links from.
        output_dir: Destination passed through to ``download_song``.
    """
    links = []
    with open(backup_file) as f:
        for line in f:
            fields = line.split(",")
            if len(fields) < 2:
                # No second field: skip instead of raising IndexError.
                continue
            link = fields[1].strip()
            if "https" in link:
                links.append(link)

    for link in tqdm(links, "Songs Downloaded"):
        download_song(link, output_dir)
        # Short pause between downloads to reduce the risk of rate limiting.
        time.sleep(5e-3)