import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the TMDB movies dataset from the CSV file next to this script
tmdbDataSet = pd.read_csv('tmdb_5000_movies.csv')
# Values in the dataset: print the frame's raw contents, then the type of
# the object that .values hands back
movieValues = tmdbDataSet.values
print(movieValues)
print(type(movieValues))
# Creating dataset out of list
index = ['Name', 'Salary', 'Age']
details = [['Ashok', 'Mike', 'Arun'], [1200, 1400, 2500], [23, 28, 30]]
zippedList = list(zip(index, details))
dictObject = dict(zippedList)
df = pd.DataFrame(dictObject)
print(df)
# Updating column labels
# Each new label is looked up from the column's current name, so a label can
# never land on the wrong column regardless of the order in which the
# DataFrame constructor arranged the columns.
labelByColumn = {'Name': 'Name', 'Salary': 'Salary (In $)', 'Age': 'Age(In Years)'}
list_labels = [labelByColumn[column] for column in df.columns]
df.columns = list_labels
print(df)
# Updating column labels at time of importing dataset
# header=0 marks the file's first row as the header being replaced by the
# names listed here.
# NOTE(review): comment='#' tells the parser to ignore the rest of a line
# after a '#'; confirm no data field in this CSV can contain '#'.
renamedColumns = [
    'budget', 'genres', 'home_page', 'id', 'keywords', 'original_language',
    'original_title', 'overview', 'popularity', 'production_companies',
    'production_countries', 'release_date', 'revenue', 'runtime',
    'spoken_languages', 'movie_status', 'tagline', 'title', 'vote_average',
    'vote_count',
]
tmdbDataSet_rename = pd.read_csv(
    'tmdb_5000_movies.csv',
    header=0,
    names=renamedColumns,
    comment='#',
)
print(tmdbDataSet_rename.head())
# Exporting dataset to csv or excel file; index=False leaves the row index
# out of both output files
tmdbDataSet.to_csv(path_or_buf='tmdb_movies.csv', index=False)
tmdbDataSet.to_excel(excel_writer='tmdb_movies.xlsx', index=False)
# Plotting datasets: default line chart of the frame on one set of axes
tmdbDataSet.plot(kind='line')
plt.show()
# Plot columns on different charts: subplots=True gives each column its own axes
tmdbDataSet.plot(kind='line', subplots=True)
plt.show()
# plot selected columns, one subplot per column
columnList = ['vote_count','budget']
selectedColumns = tmdbDataSet.loc[:, columnList]
selectedColumns.plot(subplots=True)
plt.show()
# budget on the y axis with production_companies as the x axis
tmdbDataSet.plot(x='production_companies', y=['budget'])
plt.show()
# Scatter and box plot
cols = ['vote_count','budget']
# One box plot per selected column
tmdbDataSet[cols].plot.box(subplots=True)
plt.show()
# budget vs vote_count; marker size is taken from the popularity column
tmdbDataSet.plot.scatter(x='budget', y='vote_count', s=tmdbDataSet['popularity'])
plt.show()
# Histogram chart of the vote_average column
tmdbDataSet.plot.hist(y='vote_average')
plt.show()
# PDF and CDF
# density=True normalises the histogram to a probability density. It replaces
# the `normed` keyword, which matplotlib deprecated and later removed.
tmdbDataSet.plot(kind='hist', y='vote_average', density=True)
plt.show()
# Way of plotting two charts in one figure: two stacked axes
figure, axes = plt.subplots(nrows=2, ncols=1)
# Top: plain histogram of vote_average
tmdbDataSet.plot(ax=axes[0], kind='hist', y='vote_average')
# Bottom: normalised, cumulative histogram. density=True replaces the
# `normed` keyword, which matplotlib deprecated and later removed.
tmdbDataSet.plot(ax=axes[1], kind='hist', y='vote_average', density=True,
                 cumulative=True)
plt.show()
# Load the dataset with release_date parsed into a datetime index
tmdbDataSet_date_index = pd.read_csv('tmdb_5000_movies.csv', index_col='release_date', parse_dates=True)
print(tmdbDataSet_date_index.head())
# Rows whose index label is exactly this date
print(tmdbDataSet_date_index.loc['2010-Aug-01'].head())
# Slicing between two timestamps that are not themselves index labels needs a
# sorted index; the CSV is not guaranteed to be ordered by release_date and
# missing dates become NaT. Slice a sorted, NaT-free copy and leave
# tmdbDataSet_date_index itself untouched.
tmdbDataSet_date_sorted = tmdbDataSet_date_index[tmdbDataSet_date_index.index.notna()].sort_index()
print(tmdbDataSet_date_sorted.loc['2010-01-01 21:00:00':'2010-05-11 22:00:00'].head())
# Parse timestamp strings with an explicit format. The strings carry seconds,
# so the format needs %S; without it strict parsing rejects the trailing ':00'.
parsedTimestamps = pd.to_datetime(['2010-01-01 21:00:00','2010-05-11 22:00:00'], format='%Y-%m-%d %H:%M:%S')
print(parsedTimestamps)
# Re-label the rows with even numbers (0, 2, 4, ...). The label count comes
# from the frame itself rather than a hard-coded row count, so the assignment
# still works if the CSV gains or loses rows.
tmdbDataSet.index = list(range(0, 2 * len(tmdbDataSet), 2))
# The fresh index has no name until one is assigned
print(tmdbDataSet.index.name)
tmdbDataSet.index.name = 'movie_index'
print(tmdbDataSet.index.name)
# Line plot of vote_count against the release_date index
voteCountByDate = tmdbDataSet_date_index['vote_count']
voteCountByDate.plot()
plt.show()
# Two-level (release_date, status) index, sorted before any label lookup
tmdbDataSet_multi_index = tmdbDataSet.set_index(['release_date', 'status']).sort_index()
# One exact (release_date, status) pair
releasedOnDate = ('1916-09-04', 'Released')
print(tmdbDataSet_multi_index.loc[releasedOnDate])
# Several release dates combined with a single status
releasedOnDates = (['1916-09-04', '2010-03-03'], 'Released')
print(tmdbDataSet_multi_index.loc[releasedOnDates, :])
# Any release date (slice(None)) with status 'Rumored'
rumoredAnyDate = (slice(None), 'Rumored')
print(tmdbDataSet_multi_index.loc[rumoredAnyDate, :])
# swapping index of multilevel index: levels 0 and 1 trade places
tmdbDataSet_multi_index_swap = tmdbDataSet_multi_index.swaplevel(i=0, j=1)
print(tmdbDataSet_multi_index_swap.head())
# Resetting Index: the index levels go back to being ordinary columns
tmdbDataSet_original = tmdbDataSet_multi_index_swap.reset_index()
print(tmdbDataSet_original.head())
# Stacking and Unstacking
# The frame printed below was never defined, so it is built here.
# unstack() needs unique index entries and (release_date, status) pairs may
# repeat, so budgets are first summed per pair; the status level is then
# pivoted into columns.
# NOTE(review): the original definition is missing from this file; confirm
# that a per-status budget table is the intended demonstration.
budgetByDateAndStatus = tmdbDataSet_multi_index.groupby(level=['release_date', 'status'])['budget'].sum()
tmdbDataSet_multi_index_unsatck = budgetByDateAndStatus.unstack(level='status')
print(tmdbDataSet_multi_index_unsatck.tail())
# Re-read the dataset with release_date parsed as the datetime index
tmdbDataSet_dateIndex = pd.read_csv(
    'tmdb_5000_movies.csv',
    index_col='release_date',
    parse_dates=True,
)
# Number of vote_count entries in each daily bin
voteCountSeries = tmdbDataSet_dateIndex['vote_count']
print(voteCountSeries.resample('D').count())
# Vote count movies got in August 2009
count = voteCountSeries['2009-Aug']
# Highest vote_count within each day of that month
print(count.resample('D').max())
# numeric_only=True restricts every sum below to numeric columns; without it
# newer pandas also tries to add up the text columns of the frame.
# NOTE(review): recent pandas releases rename the 'A' and 'H' aliases
# ('YE' and 'h'); switch spellings when the pinned pandas version requires it.
# Down sampling: one row per year
print(tmdbDataSet_date_index.resample('A').sum(numeric_only=True))
# Up sampling: hourly and four-hourly bins
print(tmdbDataSet_date_index.resample('H').sum(numeric_only=True))
print(tmdbDataSet_date_index.resample('4H').sum(numeric_only=True))
# Number of yearly rows per column
print(tmdbDataSet_date_index.resample('A').sum(numeric_only=True).count())
# Entries per column for every release date
byReleaseDate = tmdbDataSet.groupby('release_date')
print(byReleaseDate.count())
# Sum of budget made on each day by 5000 movies
print(byReleaseDate['budget'].sum())
# Grouping by multiple columns
byDateAndRuntime = tmdbDataSet.groupby(['release_date', 'runtime'])[['popularity', 'budget']]
print(byDateAndRuntime.sum())
# Multiple aggregations: both the sum and the count of each selected column
print(byDateAndRuntime.agg(['sum', 'count']))
# Floor-divide every vote_average by 2
print(tmdbDataSet['vote_average'] // 2)
# Upper-case the status strings
print(tmdbDataSet['status'].str.upper())
# New frame without the production_countries column
tmdbDataSet_dropped = tmdbDataSet.drop(columns=['production_countries'])
print(tmdbDataSet_dropped.head())
# Here we try to find which movie is most popular, based on the TMDB data that we have.
# Popularity of movie depends on vote_average, revenue - budget, popularity
# profit = revenue - budget, stored as a new column on the date-indexed frame
tmdbDataSet_date_index['profit'] = tmdbDataSet_date_index['revenue'].sub(
    tmdbDataSet_date_index['budget'])
tmdbDataSet_date_index_grouped = tmdbDataSet_date_index.groupby('title')
popularityColumns = ['vote_average', 'profit', 'popularity']
tmdbDataSet_date_index_grouped_sub = tmdbDataSet_date_index_grouped[popularityColumns]
# Max and min value in columns, per title
agg_results = tmdbDataSet_date_index_grouped_sub.agg(['max', 'min'])
print(agg_results)
# Extract year of release_date and set the year column as index
tmdbDataSet_agg = tmdbDataSet.copy()
releaseYear = pd.to_datetime(tmdbDataSet_agg['release_date']).dt.year
# Missing years are filled with 0 so the column can be cast to int
tmdbDataSet_agg['year'] = releaseYear.fillna(0.0).astype(int)
tmdbDataSet_agg = tmdbDataSet_agg.set_index('year')
# profit = revenue - budget
tmdbDataSet_agg['profit'] = tmdbDataSet_agg['revenue'].sub(tmdbDataSet_agg['budget'])
def countMovies(series):
    """Return ``series.count()`` for the given group of values.

    Used as a per-group aggregation function; for a pandas Series the result
    is the number of non-null entries.
    """
    return series.count()
# Per year: total profit, and number of titles as counted by countMovies
aggObject = {'profit':'sum', 'title': countMovies }
tmdbDataSet_grouped = tmdbDataSet_agg.groupby(['year']).agg(aggObject)
# Year 0 is the placeholder used for missing release dates; errors='ignore'
# keeps the drop from raising when no such row exists.
tmdbDataSet_grouped = tmdbDataSet_grouped.drop([0], axis='rows', errors='ignore')
# Copy the year index into a regular column so it can be used as a plot axis
tmdbDataSet_grouped['y'] = list(tmdbDataSet_grouped.index)
# Year against profit; marker size is the number of titles in that year
tmdbDataSet_grouped.plot(kind='scatter', y='y', x='profit',
                         s=tmdbDataSet_grouped.title)
# Shown explicitly, like every other plot in this file
plt.show()
# Bare expression: only rendered when run as the last line of a notebook cell
tmdbDataSet_grouped.head()
# Group the movies by their original_language value
grouped_dataset = tmdbDataSet.groupby(by=['original_language'])
def find_en_lang(series):
    """Label a group of language codes as English or non-English.

    Only the first value of *series* is inspected; the function is meant to
    be called on groups that each hold a single language code.

    Returns 'english' when that first value is 'en', otherwise 'non-english'.
    """
    if series.values[0] == 'en':
        return 'english'
    else:
        return 'non-english'
## Transforming data
# find_en_lang is applied per language group and its label is written back
# onto every row as a new column
languageLabels = grouped_dataset['original_language'].transform(find_en_lang)
tmdbDataSet['original_language_en'] = languageLabels
display(tmdbDataSet.head())
## Converting runtime to hours
# Whole-column division replaces the row-by-row apply: same values, without
# calling a Python function once per row.
tmdbDataSet.loc[:,'runtime_hours'] = tmdbDataSet['runtime'] / 60
display(tmdbDataSet.head())
# Total budget per original language
print(grouped_dataset['budget'].sum())


# Movies of the languages whose combined budget is greater than 25cr
# (250000000); the filter keeps or drops whole language groups, not single movies
def _budget_over_25cr(group):
    return group['budget'].sum() > 250000000


display(grouped_dataset.filter(_budget_over_25cr))
# Label every row as English / Non-English from its original_language value
language = (tmdbDataSet['original_language'] == 'en').map({True:'English',
                                                           False:'Non-English'})
# Mean budget per label. Columns are selected with a list, because newer
# pandas rejects the bare-tuple form; only budget is averaged, because
# original_language is text and has no mean.
display(tmdbDataSet.groupby(language)[['budget']].mean())