In [None]:
# Let's work on visualizations!
# The following is based on a Kaggle tutorial on visualization.
# We will use the pandas plotting tools.
# Here we will only discuss "univariate" visualizations.
import pandas as pd
import numpy as np

# Load the wine-review dataset: descriptions and reviews of 150K different
# wines, spread over 10 columns.
# The first CSV column is used as the row index rather than as a data column.
# The end goal is to use these reviews to predict which wines are good.
reviews = pd.read_csv(
    "../Data/wine-reviews/winemag-data_first150k.csv",
    index_col=0,
)

# Preview the first three rows to see what the columns look like.
reviews.head(3)

In [None]:
# Which are the top 10 wine-producing regions of the world?
# We measure this by the number of reviewed wines per province:
# value_counts() sorts by frequency, so the first 10 rows are the top 10.
(
    reviews['province']
    .value_counts()
    .head(10)
)

In [None]:
# I bet you did not know that California has about 4 times as many wines
# listed here as any other region in the world.
# Time for our first plot: a BAR PLOT, which is a perfect fit for
# categorical/nominal scales (one bar per province).
(
    reviews['province']
    .value_counts()
    .head(10)
    .plot.bar()
)

In [None]:
# Redo the bar plot above, but show each province's share of all reviews
# (a fraction between 0 and 1) instead of a raw count.
# len(reviews) counts every row, so rows with a missing province (if any)
# still count toward the denominator.
(
    reviews['province']
    .value_counts()
    .head(10)
    .div(len(reviews))
    .plot.bar()
)

In [None]:
# Next: the distribution of the "points" rating across all the wines.
# Points ratings for the listed wines range from 80 to 100.
# This is still a BAR PLOT, because every possible value gets its own bar;
# a HISTOGRAM would group the values into ranges instead.
# sort_index() orders the bars by rating rather than by frequency.
(
    reviews['points']
    .value_counts()
    .sort_index()
    .plot.bar()
)

In [None]:
# The same per-rating counts, drawn as a LINE PLOT instead of bars.
(
    reviews['points']
    .value_counts()
    .sort_index()
    .plot.line()
)

In [None]:
# The same per-rating counts once more, this time as an AREA PLOT.
(
    reviews['points']
    .value_counts()
    .sort_index()
    .plot.area()
)

In [None]:
# A HISTOGRAM is a bar plot over interval ranges (bins).
# Here we take the "price" column and count how many wines fall into each
# price range, keeping only wines priced below 200.
# Rows with a missing price (if any) fail the comparison and are left out.
reviews.loc[reviews['price'] < 200, 'price'].plot.hist()

In [None]:
# Histograms split the value range into equal-width intervals, which is a
# major drawback: they cope poorly with skewed data.
# Plotting the full, unfiltered "price" column makes that obvious:
reviews['price'].plot(kind='hist')

In [None]:
# In fact, the price distribution is so skewed that it is worth listing
# the wines priced above 1500:
reviews[reviews['price'].gt(1500)]

In [None]:
# What percentage of all wines are priced over $250?
# Rows with a missing price (if any) stay in the denominator.
(
    len(reviews.loc[reviews['price'] > 250])
    / len(reviews)
    * 100
)