import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)


df1 = pd.read_csv(r'C:\Users\hunte\Desktop\Movie_Data\movies_gross.csv')
df2 = pd.read_csv(r'C:\Users\hunte\Desktop\Movie_Data\movies_oscars.csv', encoding = "utf-8")


df1.head()


df2.head()


df_best_pic = df2[(df2.Winner == 'Y') & (df2.Award == 'Picture')]


df_best_pic.head(5)


df1['Title'] = df1['Title'].str.replace(r'\(\d{4}\)','',regex=True)
df1 = df1.drop(['Movie Info','Distributor'], axis=1)


df1.head()


df1['Title'] = df1['Title'].str.strip()


df1['Movie Runtime'] = df1['Movie Runtime'].str.replace('(\d hr)$', lambda m: m.group(1) + ' 0 min' , regex=True)
df1['Movie Runtime'] = df1['Movie Runtime'].str.replace('(\d+) hr (\d{1,2})', lambda m: str(int(m.group(1)) * 60 + int(m.group(2))), regex=True)
df1['Movie Runtime'] = df1['Movie Runtime'].str.replace(r'min','',regex=True)
df1.head()


the_titles_list = df_best_pic['Title'].tolist()


filt = df1['Title'].isin(the_titles_list)

df1_best_picture = df1.loc[filt,]


df1_best_picture


df1['Release Year'] = df1['Release Date'].str.strip().str[-4:]
df1_release_year = df1.loc[filt,['Title','Release Year','Domestic Sales (in $)','International Sales (in $)','Movie Runtime']].sort_values(by="Release Year").dropna()
df1_release_year


pd.options.display.float_format = '{:,.2f}'.format
df1_release_year.describe()


filt2 = (df1['Title'].str.len() >  1)
top_domestic = df1.loc[filt2,['Title','Release Year','Domestic Sales (in $)','International Sales (in $)','Genre','Movie Runtime','License']].sort_values(by="Domestic Sales (in $)",ascending=False)
top_domestic = top_domestic.head(100)
top_domestic.head()


top_domestic.shape

(100, 7)


top_international = df1.loc[filt2,['Title','Release Year','Domestic Sales (in $)','International Sales (in $)','Genre','Movie Runtime','License']].sort_values(by="International Sales (in $)",ascending=False)
top_international = top_international.head(100)
top_international


top_international.shape

(100, 7)


import re
from collections import Counter
master_list = []
the_count = 0
for g in top_international['Genre']:
    master_list.append(g)
    the_count = the_count + 1

x = top_international['Genre'].to_list()
the_count = 0

word_set = set()
word_list = []
for c in range(len(master_list)):
    for i in x[c].split(", "):
        y = re.sub("[\[\]]","",i)
        the_count = the_count + 1
        word_set.add(y)
        word_list.append(y)


word_dictionary = {}
for w in word_set:
    d = Counter(word_list)
    word_dictionary[w] = d[w]

# sort the dictionary by values
sorted_temp = sorted(word_dictionary.items(), key=lambda x:x[1],reverse=True)
sorted_word_dictionary = dict(sorted_temp)

#for key, value in sorted_word_dictionary.items() :
 #   print (key)


master_list2 = []
the_count2 = 0
for g2 in top_domestic['Genre']:
    master_list2.append(g2)
    the_count2 = the_count2 + 1

x2 = top_domestic['Genre'].to_list()
the_count2 = 0


word_set2 = set()
word_list2 = []
for c2 in range(len(master_list2)):
    for i2 in x2[c2].split(", "):
        y2 = re.sub("[\[\]]","",i2)
        the_count2 = the_count2 + 1
        word_set2.add(y2)
        word_list2.append(y2)


word_dictionary2 = {}
for w2 in word_set2:
    d2 = Counter(word_list2)
    word_dictionary2[w2] = d2[w2]

# sort the dictionary by values
sorted_temp2 = sorted(word_dictionary2.items(), key=lambda x:x[1],reverse=True)
sorted_word_dictionary2 = dict(sorted_temp2)
#print(sorted_word_dictionary2)

#for key, value in sorted_word_dictionary2.items() :
#    print (key)


print(sorted_word_dictionary)   # international movies

{"'Adventure'": 91, "'Action'": 58, "'Fantasy'": 44, "'Sci-Fi'": 41, "'Family'": 36, "'Comedy'": 23, "'Animation'": 22, "'Thriller'": 16, "'Drama'": 15, "'Mystery'": 11, "'Crime'": 9, "'Romance'": 7, "'Musical'": 6, "'Music'": 2, "'Biography'": 1}


print(sorted_word_dictionary2)   # domestic movies

{"'Adventure'": 90, "'Action'": 66, "'Sci-Fi'": 53, "'Fantasy'": 43, "'Family'": 27, "'Comedy'": 23, "'Drama'": 21, "'Animation'": 17, "'Thriller'": 11, "'Romance'": 9, "'Mystery'": 7, "'Crime'": 6, "'Musical'": 6, "'War'": 2, "'Horror'": 1, "'Biography'": 1}


df3=pd.DataFrame({'x':sorted_word_dictionary.keys(), 'y': sorted_word_dictionary.values()})
df4=pd.DataFrame({'x':sorted_word_dictionary2.keys(), 'y': sorted_word_dictionary2.values()})


df3['Key']='International'
df4['Key']='Domestic'
res=pd.concat([df3,df4])
sns.barplot(x='x',y='y',data=res,hue='Key',order = df4['x']) # df3['x'] sorts decending red (intl), change to df4 to sort blue (domestic)
plt.xticks(rotation=45)
plt.xlabel("Genres")
plt.ylabel("Count")
plt.title("Genres of Highest Grossing Movies")
plt.show()


max_absolute_change = 0
max_percentage_change = 0

for i in word_set:
    #print(i)
    # Music is not in both of them, so I am ignoring it
    if (i == "'Music'"):
        continue
    #print(sorted_word_dictionary[i])
    #print(sorted_word_dictionary2[i])
    x = sorted_word_dictionary[i]
    y = sorted_word_dictionary2[i]
    max_value = max(x,y)
    min_value = min(x,y)
    change = max_value - min_value
    #print("absolute change is ",change)
    percent_increase = ((max_value - min_value)/min_value) * 100
    #print("Percent change for ", i, " is: " , round(percent_increase,2))
    #print("mac is ",max_absolute_change)
    if (change > max_absolute_change):
        max_absolute_change = change
        biggest_absolute_change = i
        biggest_absolute_change_value = max_absolute_change
    if (percent_increase > max_percentage_change):
        max_percentage_change = percent_increase
        biggest_percent_change = i
        biggest_percent_change_value = max_percentage_change
        
print("Largest absolute change is ",biggest_absolute_change," with a change of",biggest_absolute_change_value)
print("Largest percentage change is ",biggest_percent_change," with a change of", "{:.2f}".format(biggest_percent_change_value),"%.")

Largest absolute change is  'Sci-Fi'  with a change of 12
Largest percentage change is  'Mystery'  with a change of 57.14 %.


filt3 = top_domestic['Title'].isin(the_titles_list)
df1_best_picture_gross = top_domestic.loc[filt3,]
df1_best_picture_gross.shape

(3, 7)


df1_best_picture_gross


df1['License'].value_counts().plot.bar(title="Highest Grossing Movie Ratings")

<AxesSubplot:title={'center':'Highest Grossing Movie Ratings'}>


df1['License'].value_counts()

PG-13    363
R        194
PG       173
G         14
Name: License, dtype: int64


df1['License'].value_counts(normalize=True)

PG-13   0.49
R       0.26
PG      0.23
G       0.02
Name: License, dtype: float64

	Unnamed: 0	Title	Movie Info	Distributor	Release Date	Domestic Sales (in $)	International Sales (in $)	World Sales (in $)	Genre	Movie Runtime	License
0	0	Star Wars: Episode VII - The Force Awakens (2015)	As a new threat to the galaxy rises, Rey, a de...	Walt Disney Studios Motion Pictures	December 16, 2015	936662225	1132859475	2069521700	['Action', 'Adventure', 'Sci-Fi']	2 hr 18 min	PG-13
1	1	Avengers: Endgame (2019)	After the devastating events of Avengers: Infi...	Walt Disney Studios Motion Pictures	April 24, 2019	858373000	1939128328	2797501328	['Action', 'Adventure', 'Drama', 'Sci-Fi']	3 hr 1 min	PG-13
2	2	Avatar (2009)	A paraplegic Marine dispatched to the moon Pan...	Twentieth Century Fox	December 16, 2009	760507625	2086738578	2847246203	['Action', 'Adventure', 'Fantasy', 'Sci-Fi']	2 hr 42 min	PG-13
3	3	Black Panther (2018)	T'Challa, heir to the hidden but advanced king...	Walt Disney Studios Motion Pictures	NaN	700426566	647171407	1347597973	['Action', 'Adventure', 'Sci-Fi']	2 hr 14 min	NaN
4	4	Avengers: Infinity War (2018)	The Avengers and their allies must be willing ...	Walt Disney Studios Motion Pictures	NaN	678815482	1369544272	2048359754	['Action', 'Adventure', 'Sci-Fi']	2 hr 29 min	NaN

	Title	Winner	Award	Year
0	Nomadland	Y	Picture	2020
1	Judas and the Black Messiah	N	Picture	2020
2	Mank	N	Picture	2020
3	Minari	N	Picture	2020
4	Promising Young Woman	N	Picture	2020

	Title	Winner	Award	Year
0	Nomadland	Y	Picture	2020
28	Parasite	Y	Picture	2019
57	Green Book	Y	Picture	2018
85	The Shape of Water	Y	Picture	2017
114	Moonlight	Y	Picture	2016

	Unnamed: 0	Title	Release Date	Domestic Sales (in $)	International Sales (in $)	World Sales (in $)	Genre	Movie Runtime	License
0	0	Star Wars: Episode VII - The Force Awakens	December 16, 2015	936662225	1132859475	2069521700	['Action', 'Adventure', 'Sci-Fi']	2 hr 18 min	PG-13
1	1	Avengers: Endgame	April 24, 2019	858373000	1939128328	2797501328	['Action', 'Adventure', 'Drama', 'Sci-Fi']	3 hr 1 min	PG-13
2	2	Avatar	December 16, 2009	760507625	2086738578	2847246203	['Action', 'Adventure', 'Fantasy', 'Sci-Fi']	2 hr 42 min	PG-13
3	3	Black Panther	NaN	700426566	647171407	1347597973	['Action', 'Adventure', 'Sci-Fi']	2 hr 14 min	NaN
4	4	Avengers: Infinity War	NaN	678815482	1369544272	2048359754	['Action', 'Adventure', 'Sci-Fi']	2 hr 29 min	NaN

	Unnamed: 0	Title	Release Date	Domestic Sales (in $)	International Sales (in $)	World Sales (in $)	Genre	Movie Runtime	License
0	0	Star Wars: Episode VII - The Force Awakens	December 16, 2015	936662225	1132859475	2069521700	['Action', 'Adventure', 'Sci-Fi']	138	PG-13
1	1	Avengers: Endgame	April 24, 2019	858373000	1939128328	2797501328	['Action', 'Adventure', 'Drama', 'Sci-Fi']	181	PG-13
2	2	Avatar	December 16, 2009	760507625	2086738578	2847246203	['Action', 'Adventure', 'Fantasy', 'Sci-Fi']	162	PG-13
3	3	Black Panther	NaN	700426566	647171407	1347597973	['Action', 'Adventure', 'Sci-Fi']	134	NaN
4	4	Avengers: Infinity War	NaN	678815482	1369544272	2048359754	['Action', 'Adventure', 'Sci-Fi']	149	NaN

Analyzing Best Picture Winners

Before I try to answer these questions, I first need to import the data, examine it, and do some cleaning of the data.¶

Filter df2 to only Best Picture winners¶

Remove the years in parentheses from the Title column in df1 and two drop uneeded columns¶

Remove extra space at end of Title¶

Add '0 min' to movies that do not have minutes, then convert the runtime to minutes only, and remove 'min'¶

Put all the Best Picture winning titles in a list to use as a filter in the df1 dataframe¶

Modify and rename the Release Date column to just show the release year, then create a new dataframe organized by Release Year¶

Format the data to display with 2 decimal places, add 'thousand-separator' commas, and then view a summary of the movie sales¶

Get the top 100 films based on domestic sales, and the top 100 based on international sales, and then compare the two.¶

The Genre column for both top_domestic and top_international are lists of values. To compare the two dataframes using that column, I need to break the list apart and get a count of each unique value¶

Visualize the difference in genres between top 100 films by domestic box office and top 100 films by international box office.¶

From the chart, it appears as if 'Sci-Fri' might have the biggest difference between domestic and international sales, but below I will use code to determine the difference in both absolute terms and as a percentage.¶

Here we see that of the top 100 highest grossing films, only 3 of them won the Oscar for Best Picture¶

In the chart below, we see that PG-13 movies tend to perform best at the box office, while films G-rated films bring in less money, on average.¶

Conclusion¶

In analyzing this movie data, I have shown that:¶

1. In general, movie-goers all over the world tend to have similar tastes in movies.¶

2. What is considered a great film by the Academy of Motion Pictures Arts and Sciences do not often coincide with what the general public pays money to see in theaters.¶

3. G-rated movies are the least popular movies in terms of box office revenue, while PG-13 are the most popular¶

	Unnamed: 0	Title	Release Date	Domestic Sales (in $)	International Sales (in $)	World Sales (in $)	Genre	Movie Runtime	License
6	6	Titanic	December 19, 1997	659363944	1542283320	2201647264	['Drama', 'Romance']	194	PG-13
45	45	The Lord of the Rings: The Return of the King	December 17, 2003	377845905	768185007	1146030912	['Action', 'Adventure', 'Drama', 'Fantasy']	201	PG-13
67	67	Forrest Gump	July 6, 1994	330455270	347770863	678226133	['Drama', 'Romance']	142	PG-13
238	238	Gladiator	May 4, 2000	187705427	277675375	465380802	['Action', 'Adventure', 'Drama']	155	R
245	245	Dances with Wolves	November 9, 1990	184208848	240000000	424208848	['Adventure', 'Drama', 'Western']	181	NaN
289	289	Rain Man	December 16, 1988	172825435	182000000	354825435	['Drama']	133	NaN
297	297	A Beautiful Mind	December 21, 2001	170742341	146048916	316791257	['Biography', 'Drama']	135	PG-13
298	298	Chicago	February 26, 2002	170687518	136089214	306776732	['Comedy', 'Crime', 'Musical']	113	PG-13
409	409	Slumdog Millionaire	November 12, 2008	141319928	237090614	378410542	['Drama', 'Romance']	120	R
421	421	The King's Speech	November 26, 2010	138797449	288576868	427374317	['Biography', 'Drama', 'History']	118	R
424	424	Platoon	December 19, 1986	138530565	15067	138545632	['Drama', 'War']	120	NaN
436	436	Argo	October 11, 2012	136025503	96300000	232325503	['Biography', 'Drama', 'Thriller']	120	R
441	441	The Godfather	March 15, 1972	134966411	111154575	246120986	['Crime', 'Drama']	175	NaN
457	457	The Departed	October 5, 2006	132384315	159081058	291465373	['Crime', 'Drama', 'Thriller']	151	R
465	465	The Silence of the Lambs	February 14, 1991	130742922	142000000	272742922	['Crime', 'Drama', 'Horror', 'Thriller']	118	NaN
473	473	American Beauty	September 15, 1999	130096601	226200000	356296601	['Drama']	122	R
621	621	Terms of Endearment	November 23, 1983	108423489	260	108423749	['Comedy', 'Drama']	132	NaN
696	696	Unforgiven	NaN	101157447	58000000	159157447	['Drama', 'Western']	130	NaN
712	712	Million Dollar Baby	December 15, 2004	100492203	116271443	216763646	['Drama', 'Sport']	132	PG-13
718	718	Shakespeare in Love	December 11, 1998	100317794	189000000	289317794	['Comedy', 'Drama', 'History', 'Romance']	123	R
740	740	Schindler's List	December 15, 1993	96898818	225262427	322161245	['Biography', 'Drama', 'History']	195	R
851	851	Green Book	November 16, 2018	85080171	236672485	321752656	['Biography', 'Comedy', 'Drama', 'Music']	130	PG-13

	Title	Release Year	Domestic Sales (in $)	International Sales (in $)	Movie Runtime
441	The Godfather	1972	134966411	111154575	175
621	Terms of Endearment	1983	108423489	260	132
424	Platoon	1986	138530565	15067	120
289	Rain Man	1988	172825435	182000000	133
245	Dances with Wolves	1990	184208848	240000000	181
465	The Silence of the Lambs	1991	130742922	142000000	118
740	Schindler's List	1993	96898818	225262427	195
67	Forrest Gump	1994	330455270	347770863	142
6	Titanic	1997	659363944	1542283320	194
718	Shakespeare in Love	1998	100317794	189000000	123
473	American Beauty	1999	130096601	226200000	122
238	Gladiator	2000	187705427	277675375	155
297	A Beautiful Mind	2001	170742341	146048916	135
298	Chicago	2002	170687518	136089214	113
45	The Lord of the Rings: The Return of the King	2003	377845905	768185007	201
712	Million Dollar Baby	2004	100492203	116271443	132
457	The Departed	2006	132384315	159081058	151
409	Slumdog Millionaire	2008	141319928	237090614	120
421	The King's Speech	2010	138797449	288576868	118
436	Argo	2012	136025503	96300000	120
851	Green Book	2018	85080171	236672485	130

	Domestic Sales (in $)	International Sales (in $)
count	21.00	21.00
mean	182,281,469.38	269,889,404.38
std	130,677,071.29	330,263,121.17
min	85,080,171.00	260.00
25%	130,096,601.00	136,089,214.00
50%	138,530,565.00	189,000,000.00
75%	172,825,435.00	240,000,000.00
max	659,363,944.00	1,542,283,320.00

	Title	Release Year	Domestic Sales (in $)	International Sales (in $)	Genre	Movie Runtime	License
0	Star Wars: Episode VII - The Force Awakens	2015	936662225	1132859475	['Action', 'Adventure', 'Sci-Fi']	138	PG-13
1	Avengers: Endgame	2019	858373000	1939128328	['Action', 'Adventure', 'Drama', 'Sci-Fi']	181	PG-13
2	Avatar	2009	760507625	2086738578	['Action', 'Adventure', 'Fantasy', 'Sci-Fi']	162	PG-13
3	Black Panther	NaN	700426566	647171407	['Action', 'Adventure', 'Sci-Fi']	134	NaN
4	Avengers: Infinity War	NaN	678815482	1369544272	['Action', 'Adventure', 'Sci-Fi']	149	NaN

Analyzing Best Picture Winners

In this notebook, I will answer the following movie-related questions.¶

Before I try to answer these questions, I first need to import the data, examine it, and do some cleaning of the data.¶

Filter df2 to only Best Picture winners¶

Remove the years in parentheses from the Title column in df1 and two drop uneeded columns¶

Remove extra space at end of Title¶

Add '0 min' to movies that do not have minutes, then convert the runtime to minutes only, and remove 'min'¶

Put all the Best Picture winning titles in a list to use as a filter in the df1 dataframe¶

Modify and rename the Release Date column to just show the release year, then create a new dataframe organized by Release Year¶

Format the data to display with 2 decimal places, add 'thousand-separator' commas, and then view a summary of the movie sales¶

Get the top 100 films based on domestic sales, and the top 100 based on international sales, and then compare the two.¶

The Genre column for both top_domestic and top_international are lists of values. To compare the two dataframes using that column, I need to break the list apart and get a count of each unique value¶

Visualize the difference in genres between top 100 films by domestic box office and top 100 films by international box office.¶

From the chart, it appears as if 'Sci-Fri' might have the biggest difference between domestic and international sales, but below I will use code to determine the difference in both absolute terms and as a percentage.¶

Here we see that of the top 100 highest grossing films, only 3 of them won the Oscar for Best Picture¶

In the chart below, we see that PG-13 movies tend to perform best at the box office, while films G-rated films bring in less money, on average.¶

Conclusion¶

In analyzing this movie data, I have shown that:¶

1. In general, movie-goers all over the world tend to have similar tastes in movies.¶

2. What is considered a great film by the Academy of Motion Pictures Arts and Sciences do not often coincide with what the general public pays money to see in theaters.¶

3. G-rated movies are the least popular movies in terms of box office revenue, while PG-13 are the most popular¶