import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# Load the TMDB movies file; the resulting frame is reused throughout the script.
tmdbDataSet = pd.read_csv('tmdb_5000_movies.csv')


# Raw values held by the dataset, and the type of the container returned
dataset_values = tmdbDataSet.values
print(dataset_values)
print(type(dataset_values))


# Creating dataset out of list
# Each entry of `index` is a column name and the list at the same position in
# `details` holds that column's values.
index = ['Name', 'Salary', 'Age']
details = [['Ashok', 'Mike', 'Arun'], [1200, 1400, 2500], [23, 28, 30]]
zippedList = list(zip(index, details))
dictObject = dict(zippedList)
df = pd.DataFrame(dictObject)
print(df)


# Updating column labels
# Assigning to `df.columns` relabels by position, so the new labels must be
# listed in the existing column order (Name, Salary, Age); any other order
# puts the data under the wrong heading.
list_labels = ['Name', 'Salary (In $)', 'Age(In Years)']
df.columns = list_labels
print(df)


# Updating column labels at time of importing dataset
# `header=0` marks the first row of the file as the header so that the labels
# passed through `names` replace it.
renamed_columns = [
    'budget', 'genres', 'home_page', 'id', 'keywords', 'original_language',
    'original_title', 'overview', 'popularity', 'production_companies',
    'production_countries', 'release_date', 'revenue', 'runtime',
    'spoken_languages', 'movie_status', 'tagline', 'title', 'vote_average',
    'vote_count',
]
tmdbDataSet_rename = pd.read_csv(
    'tmdb_5000_movies.csv',
    header=0,
    names=renamed_columns,
    comment='#',
)
print(tmdbDataSet_rename.head())


# Exporting dataset to csv or excel file (the row index is left out of both)
export_options = {'index': False}
tmdbDataSet.to_csv('tmdb_movies.csv', **export_options)
tmdbDataSet.to_excel('tmdb_movies.xlsx', **export_options)


# Plotting datasets: a single line chart for the whole frame
tmdbDataSet.plot.line()
plt.show()


# Plot columns on different charts: one subplot per column
tmdbDataSet.plot.line(subplots=True)
plt.show()


# Plot selected columns only, again one subplot each
columnList = ['vote_count', 'budget']
selected_columns = tmdbDataSet[columnList]
selected_columns.plot.line(subplots=True)
plt.show()


# Budget drawn against the production companies column
tmdbDataSet.plot.line(x='production_companies', y=['budget'])
plt.show()


# Scatter and box plot
cols = ['vote_count', 'budget']
box_columns = tmdbDataSet[cols]
box_columns.plot.box(subplots=True)
plt.show()

# Marker size of every point is taken from the popularity column
marker_sizes = tmdbDataSet['popularity']
tmdbDataSet.plot.scatter(x='budget', y='vote_count', s=marker_sizes)
plt.show()


# Histogram chart
tmdbDataSet.plot(kind='hist', y='vote_average')
plt.show()


# PDF and CDF
# `density=True` normalises the histogram to unit area. It replaces the old
# `normed` keyword, which Matplotlib no longer accepts.
tmdbDataSet.plot(kind='hist', y='vote_average', density=True)
plt.show()


# Way of plotting two charts in one figure: plain histogram on the top axes,
# normalised cumulative histogram (CDF) on the bottom axes
figure, axes = plt.subplots(nrows=2, ncols=1)

tmdbDataSet.plot(ax=axes[0], kind='hist', y='vote_average')
tmdbDataSet.plot(ax=axes[1], kind='hist', y='vote_average', density=True,
                 cumulative=True)

plt.show()


# Import the dataset again, this time with the parsed release date as index
tmdbDataSet_date_index = pd.read_csv('tmdb_5000_movies.csv', index_col='release_date', parse_dates=True)

print(tmdbDataSet_date_index.head())

# Slicing by date labels needs a sorted index without missing dates, so the
# lookups run on a cleaned, sorted copy; tmdbDataSet_date_index keeps its
# original row order for the code that follows.
_rows_with_date = tmdbDataSet_date_index[tmdbDataSet_date_index.index.notna()]
_rows_by_date = _rows_with_date.sort_index()

# A one-day slice gives an empty frame instead of raising KeyError when no
# movie was released on that day.
print(_rows_by_date.loc['2010-08-01':'2010-08-01'].head())
print(_rows_by_date.loc['2010-01-01 21:00:00':'2010-05-11 22:00:00'].head())


# The format has to describe the complete string, seconds included; a format
# that stops at the minutes leaves ":00" unparsed and strict parsing rejects it.
parsed_timestamps = pd.to_datetime(['2010-01-01 21:00:00', '2010-05-11 22:00:00'],
                                   format='%Y-%m-%d %H:%M:%S')
print(parsed_timestamps)


# Replace the default index with even numbers (0, 2, 4, ...). The number of
# labels comes from the frame itself, so the assignment keeps working when the
# file holds a different number of rows.
tmdbDataSet.index = [position * 2 for position in range(len(tmdbDataSet))]


# The index has no name until one is assigned
print(tmdbDataSet.index.name)
tmdbDataSet.index.name = 'movie_index'
print(tmdbDataSet.index.name)


# Vote counts drawn against the release-date index
tmdbDataSet_date_index.vote_count.plot()
plt.show()


# Two-level index (release date, status), sorted before the lookups below
tmdbDataSet_multi_index = tmdbDataSet.set_index(['release_date', 'status']).sort_index()


# One exact (release_date, status) pair
single_key = ('1916-09-04', 'Released')
print(tmdbDataSet_multi_index.loc[single_key])


# Several release dates combined with one status
released_on_dates = (['1916-09-04', '2010-03-03'], 'Released')
print(tmdbDataSet_multi_index.loc[released_on_dates, :])


# Any release date combined with one status
every_rumored = pd.IndexSlice[:, 'Rumored']
print(tmdbDataSet_multi_index.loc[every_rumored, :])


# swapping index of multilevel index
tmdbDataSet_multi_index_swap = tmdbDataSet_multi_index.swaplevel(i=0, j=1)
print(tmdbDataSet_multi_index_swap.head())

# Resetting Index: both index levels become ordinary columns again
tmdbDataSet_original = tmdbDataSet_multi_index_swap.reset_index()
print(tmdbDataSet_original.head())


# Stacking and Unstacking
# unstack() cannot pivot an index that contains duplicate entries, and
# (release_date, status) pairs may repeat when several movies share a date, so
# the budget is first summed per pair to give every pair exactly one row.
_budget_per_date_and_status = tmdbDataSet_multi_index.groupby(
    level=['release_date', 'status'])['budget'].sum()

# Move the 'status' index level into the columns: one row per release date,
# one column per status.
tmdbDataSet_multi_index_unsatck = _budget_per_date_and_status.unstack(level='status')


print(tmdbDataSet_multi_index_unsatck.tail())


# Dataset indexed by the parsed release date
tmdbDataSet_dateIndex = pd.read_csv(
    'tmdb_5000_movies.csv',
    index_col='release_date',
    parse_dates=True,
)


# Number of vote_count entries per calendar day
vote_counts_by_date = tmdbDataSet_dateIndex['vote_count']
print(vote_counts_by_date.resample('D').count())


# Vote count movies got in August 2009
count = vote_counts_by_date['2009-Aug']


# Largest vote count for each day of that month
daily_maximum = count.resample('D').max()
print(daily_maximum)


# Totals only make sense for numeric columns, so the text columns are left out
# before resampling (summing them would also be very slow on the hourly grid).
# NOTE: pandas >= 2.2 deprecates the 'A' and 'H' aliases in favour of 'YE' and
# 'h'; the original aliases are kept here for older installations.
_numeric_by_date = tmdbDataSet_date_index.select_dtypes(include='number')


# Down sampling: one row of totals per year
_yearly_totals = _numeric_by_date.resample('A').sum()
print(_yearly_totals)


# Up sampling: hourly and four-hourly bins
print(_numeric_by_date.resample('H').sum())
print(_numeric_by_date.resample('4H').sum())


# Number of yearly rows per column (reuses the yearly totals computed above)
print(_yearly_totals.count())


# Non-null entries of every column per release date
by_release_date = tmdbDataSet.groupby('release_date')
print(by_release_date.count())


# Sum of budget made on each day by 5000 movies
print(by_release_date['budget'].sum())


# Grouping by multiple columns
by_date_and_runtime = tmdbDataSet.groupby(['release_date', 'runtime'])
popularity_and_budget = by_date_and_runtime[['popularity', 'budget']]
print(popularity_and_budget.sum())


# Multiple aggregations
print(popularity_and_budget.agg(['sum', 'count']))


# Floor division of the average vote by two
print(tmdbDataSet['vote_average'] // 2)


# Status text in upper case
print(tmdbDataSet['status'].str.upper())


# Copy of the dataset without the production_countries column
tmdbDataSet_dropped = tmdbDataSet.drop(columns=['production_countries'])
print(tmdbDataSet_dropped.head())


# Popularity of movie depends on vote_average, revenue - budget, popularity
# profit = revenue - budget, stored as a new column
tmdbDataSet_date_index['profit'] = tmdbDataSet_date_index['revenue'].sub(
    tmdbDataSet_date_index['budget'])
tmdbDataSet_date_index_grouped = tmdbDataSet_date_index.groupby('title')
columns_of_interest = ['vote_average', 'profit', 'popularity']
tmdbDataSet_date_index_grouped_sub = tmdbDataSet_date_index_grouped[columns_of_interest]


# Max and min value in columns
agg_results = tmdbDataSet_date_index_grouped_sub.agg(['max', 'min'])
print(agg_results)


# Extract the year of release_date and use it as the index of a working copy
tmdbDataSet_agg = tmdbDataSet.copy()
release_years = pd.to_datetime(tmdbDataSet_agg['release_date']).dt.year
# Rows without a release date get year 0 so the column can be cast to int
tmdbDataSet_agg['year'] = release_years.fillna(0.0).astype(int)
tmdbDataSet_agg = tmdbDataSet_agg.set_index('year')
tmdbDataSet_agg['profit'] = tmdbDataSet_agg['revenue'] - tmdbDataSet_agg['budget']


def countMovies(series):
    """Return how many non-null entries `series` holds."""
    non_null_total = series.count()
    return non_null_total
# Per release year: total profit and number of titles
aggObject = {'profit': 'sum', 'title': countMovies}
tmdbDataSet_grouped = tmdbDataSet_agg.groupby(['year']).agg(aggObject)


# Year 0 is presumably the placeholder given to movies without a release date;
# errors='ignore' keeps the drop from raising when no such row exists.
tmdbDataSet_grouped = tmdbDataSet_grouped.drop([0], axis='rows', errors='ignore')
# The scatter plot needs the year as a regular column, not only as the index
tmdbDataSet_grouped['y'] = list(tmdbDataSet_grouped.index)
# Marker size is the number of titles counted for that year
tmdbDataSet_grouped.plot(kind='scatter', y='y', x='profit',
                         s=tmdbDataSet_grouped['title'])
plt.show()


# A bare expression prints nothing when run as a script, so print explicitly
print(tmdbDataSet_grouped.head())


# Movies grouped by their original language code
grouped_dataset = tmdbDataSet.groupby(['original_language'])


def find_en_lang(series):
    """Return 'english' or 'non-english' for a series of language codes.

    Only the first value of `series` is inspected; it is compared with 'en'.
    """
    return 'english' if series.values[0] == 'en' else 'non-english'


## Transforming data
# Every row receives the label computed for its language group
tmdbDataSet.loc[:, 'original_language_en'] = grouped_dataset['original_language'].transform(
    find_en_lang)
# print() is used because display() only exists inside IPython/Jupyter
print(tmdbDataSet.head())


## Converting runtime to hours
# Vectorised division over the whole column instead of a row-by-row apply
tmdbDataSet.loc[:, 'runtime_hours'] = tmdbDataSet['runtime'] / 60
print(tmdbDataSet.head())


# Total budget per original language
print(grouped_dataset['budget'].sum())


# Keep the movies of every language whose combined budget exceeds 250,000,000
# (25 crore): the filter accepts or rejects whole groups, not single movies.
# print() is used because display() only exists inside IPython/Jupyter.
print(grouped_dataset.filter(lambda x: x['budget'].sum() > 250000000))


# Label every movie as English or Non-English and use the labels as group keys
language = (tmdbDataSet['original_language'] == 'en').map({True: 'English',
                                                           False: 'Non-English'})
# Columns are selected with a list (tuple-style selection is rejected by
# current pandas), and only the numeric budget column is averaged because a
# mean over the text column 'original_language' is undefined.
# print() is used because display() only exists inside IPython/Jupyter.
print(tmdbDataSet.groupby(language)[['budget']].mean())

# Importing Packages

# Reading data

# Creating a dataset from lists

# Using plot function of pandas

# Pandas with time series

# Adding multiple Indexes to dataset

# Parsing date column while importing data

# Resampling Dataset

# Filtering and chaining methods

# Grouping of data

# Transforming data

# Aggregating results

# Transform and Apply Function

# Using map function