import torch
import pandas as pd
import numpy as np
# for embedding visualization later
import plotly.express as px
import plotly.io as pio
# for VSCode plotly rendering
pio.renderers.default = "notebook"
# for appearance
pio.templates.default = "plotly_white"
# for train-test split
from sklearn.model_selection import train_test_split
# for suppressing bugged warnings from torchinfo
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Download the music dataset (CSV) straight from GitHub into a DataFrame.
url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)
# Names of the precomputed numeric feature columns in the dataset.
engineered_features = [
    'dating', 'violence', 'world/life', 'night/time', 'shake the audience',
    'family/gospel', 'romantic', 'communication', 'obscene', 'music',
    'movement/places', 'light/visual perceptions', 'family/spiritual',
    'like/girls', 'sadness', 'feelings', 'danceability', 'loudness',
    'acousticness', 'instrumentalness', 'valence', 'energy',
]

# Number of tracks per genre (notebook cell display).
df.groupby("genre").size()
genre
blues 4604
country 5445
hip hop 904
jazz 3845
pop 7042
reggae 2498
rock 4034
dtype: int64
# Integer class label for each genre name.
genres = {
    "blues": 0,
    "country": 1,
    "hip hop": 2,
    "jazz": 3,
    "pop": 4,
    "reggae": 5,
    "rock": 6,
}

# Replace the genre names with their integer labels.
# NOTE: this overwrites the column in place, so running it a second time
# looks up the integer labels in `genres` and yields None for every row.
df["genre"] = df["genre"].apply(genres.get)
df.head()
Unnamed: 0 | artist_name | track_name | release_date | genre | lyrics | len | dating | violence | world/life | ... | sadness | feelings | danceability | loudness | acousticness | instrumentalness | valence | energy | topic | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | mukesh | mohabbat bhi jhoothi | 1950 | 4 | hold time feel break feel untrue convince spea... | 95 | 0.000598 | 0.063746 | 0.000598 | ... | 0.380299 | 0.117175 | 0.357739 | 0.454119 | 0.997992 | 0.901822 | 0.339448 | 0.137110 | sadness | 1.0 |
1 | 4 | frankie laine | i believe | 1950 | 4 | believe drop rain fall grow believe darkest ni... | 51 | 0.035537 | 0.096777 | 0.443435 | ... | 0.001284 | 0.001284 | 0.331745 | 0.647540 | 0.954819 | 0.000002 | 0.325021 | 0.263240 | world/life | 1.0 |
2 | 6 | johnnie ray | cry | 1950 | 4 | sweetheart send letter goodbye secret feel bet... | 24 | 0.002770 | 0.002770 | 0.002770 | ... | 0.002770 | 0.225422 | 0.456298 | 0.585288 | 0.840361 | 0.000000 | 0.351814 | 0.139112 | music | 1.0 |
3 | 10 | pérez prado | patricia | 1950 | 4 | kiss lips want stroll charm mambo chacha merin... | 54 | 0.048249 | 0.001548 | 0.001548 | ... | 0.225889 | 0.001548 | 0.686992 | 0.744404 | 0.083935 | 0.199393 | 0.775350 | 0.743736 | romantic | 1.0 |
4 | 12 | giorgos papadopoulos | apopse eida oneiro | 1950 | 4 | till darling till matter know till dream live ... | 48 | 0.001350 | 0.001350 | 0.417772 | ... | 0.068800 | 0.001350 | 0.291671 | 0.646489 | 0.975904 | 0.000246 | 0.597073 | 0.394375 | romantic | 1.0 |
5 rows × 31 columns
from torch.utils.data import Dataset, DataLoader
class TextDataFromDF(Dataset):
    """Dataset that serves ``(text, label)`` pairs from the rows of a DataFrame.

    Args:
        df: DataFrame containing the columns named by ``text_col`` and
            ``label_col``.
        text_col: Name of the column holding the text. Defaults to
            ``"lyrics"``.
        label_col: Name of the column holding the label. Defaults to
            ``"genre"``.

    Raises:
        KeyError: If ``text_col`` or ``label_col`` is not a column of ``df``.
    """

    def __init__(self, df, text_col="lyrics", label_col="genre"):
        self.df = df
        # Resolve the column names to positions once, so that item access
        # stays a plain positional lookup and does not depend on the frame's
        # column order (the label used to be read from position 0, which is
        # the row-id column rather than the genre).
        self._text_idx = df.columns.get_loc(text_col)
        self._label_idx = df.columns.get_loc(label_col)

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair for the row at position ``index``."""
        text = self.df.iloc[index, self._text_idx]
        label = self.df.iloc[index, self._label_idx]
        return text, label

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)
# Hold out 20% of the (shuffled) rows for validation.
df_train, df_val = train_test_split(df, shuffle=True, test_size=0.2)

# Wrap each split in a Dataset.
train_data = TextDataFromDF(df_train)
val_data = TextDataFromDF(df_val)

# Peek at a single training example (notebook cell display).
train_data[194]
('morning ride think morning ride morning ride nice ride miss morning ride longest ride morning ride morning ride morning ride morning ride slip slide go break slip slide go break matter hide send send morning ride morning ride morning ride morning ride tell ellington work jamaica buerue credit station gemini port portland morning ride morning ride morning ride morning ride',
64271)
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# NOTE(review): the recorded output below this cell is an OSError raised while
# loading libtorchtext.so (a symbol expected in libtorch_cpu.dylib was not
# found). Presumably the installed torchtext and torch builds are mismatched;
# confirm the installed versions before relying on this cell.
tokenizer = get_tokenizer('basic_english')

# Tokenize the text of one training example and display the tokens.
tokenized = tokenizer(train_data[194][0])
tokenized
OSError: dlopen(/Users/jakegilbert/anaconda3/envs/ml-0451/lib/python3.9/site-packages/torchtext/lib/libtorchtext.so, 0x0006): Symbol not found: __ZN2at4_ops15to_dtype_layout4callERKNS_6TensorENSt3__18optionalIN3c1010ScalarTypeEEENS6_INS7_6LayoutEEENS6_INS7_6DeviceEEENS6_IbEEbbNS6_INS7_12MemoryFormatEEE
Referenced from: <B145C7C7-A04C-3975-B142-8B160ADC1CFF> /Users/jakegilbert/anaconda3/envs/ml-0451/lib/python3.9/site-packages/torchtext/lib/libtorchtext.so
Expected in: <6B754090-A299-3FA1-B21D-A3C9B7051AD1> /Users/jakegilbert/anaconda3/envs/ml-0451/lib/python3.9/site-packages/torch/lib/libtorch_cpu.dylib