{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's work on visualizations!\n",
    "# The following is based on a Kaggle tutorial on visualization.\n",
    "# We will use the plotting tools built into pandas and\n",
    "# only discuss \"univariate\" visualizations here.\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# The dataset holds descriptions and reviews of 150K different wines,\n",
    "# spread over 10 columns. The eventual goal is to analyze this data\n",
    "# to predict good wines from these reviews.\n",
    "# The path is resolved relative to the notebook's working directory.\n",
    "WINE_REVIEWS_CSV = \"../Data/wine-reviews/winemag-data_first150k.csv\"\n",
    "\n",
    "# index_col=0 makes the first CSV column the row index.\n",
    "reviews = pd.read_csv(WINE_REVIEWS_CSV, index_col=0)\n",
    "reviews.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Which 10 provinces contribute the most wines to this dataset?\n",
    "# value_counts() sorts by descending count, so the first 10 entries are the top 10.\n",
    "reviews.loc[:, 'province'].value_counts().iloc[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# California reportedly has about 4 times as many wines in this dataset as\n",
    "# any other province (compare with the counts above). Note that these are\n",
    "# numbers of reviewed wines, not production volumes.\n",
    "# Our first plot is a BAR PLOT, well suited to categorical/nominal scales.\n",
    "# The trailing ';' hides the text repr returned by .set().\n",
    "(\n",
    "    reviews['province']\n",
    "    .value_counts()\n",
    "    .head(10)\n",
    "    .plot.bar(title='Top 10 provinces by number of wines')\n",
    "    .set(xlabel='Province', ylabel='Number of wines')\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same bar plot, but each bar is a fraction of all rows in the dataset.\n",
    "# Dividing by len(reviews) keeps any rows with a missing province in the\n",
    "# denominator (value_counts() itself skips them).\n",
    "# The trailing ';' hides the text repr returned by .set().\n",
    "(\n",
    "    (reviews['province'].value_counts().head(10) / len(reviews))\n",
    "    .plot.bar(title='Top 10 provinces by share of wines')\n",
    "    .set(xlabel='Province', ylabel='Fraction of all wines')\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distribution of the \"points\" ratings across all wines.\n",
    "# Points for the listed wines range from 80 to 100.\n",
    "# This is a BAR PLOT: there is one bar for every distinct value.\n",
    "# (A HISTOGRAM would instead group the values into ranges.)\n",
    "# sort_index() orders the bars by points value rather than by count.\n",
    "# The trailing ';' hides the text repr returned by .set().\n",
    "(\n",
    "    reviews['points']\n",
    "    .value_counts()\n",
    "    .sort_index()\n",
    "    .plot.bar(title='Number of wines per points rating')\n",
    "    .set(xlabel='Points', ylabel='Number of wines')\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The same per-rating counts drawn as a LINE chart.\n",
    "# sort_index() puts the ratings in ascending order along the x-axis.\n",
    "# The trailing ';' hides the text repr returned by .set().\n",
    "(\n",
    "    reviews['points']\n",
    "    .value_counts()\n",
    "    .sort_index()\n",
    "    .plot.line(title='Number of wines per points rating (line)')\n",
    "    .set(xlabel='Points', ylabel='Number of wines')\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The same per-rating counts drawn as an AREA chart\n",
    "# (a line chart with the region under the line filled in).\n",
    "# The trailing ';' hides the text repr returned by .set().\n",
    "(\n",
    "    reviews['points']\n",
    "    .value_counts()\n",
    "    .sort_index()\n",
    "    .plot.area(title='Number of wines per points rating (area)')\n",
    "    .set(xlabel='Points', ylabel='Number of wines')\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A histogram is a bar plot over interval ranges (bins).\n",
    "# This one looks at the \"price\" column and counts the number of wines\n",
    "# within each price range, keeping only prices below 200.\n",
    "# .loc selects the filtered rows and the column in a single step.\n",
    "# The trailing ';' hides the text repr returned by .set().\n",
    "(\n",
    "    reviews.loc[reviews['price'] < 200, 'price']\n",
    "    .plot.hist(title='Distribution of wine prices below 200')\n",
    "    .set(xlabel='Price', ylabel='Number of wines')\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A major drawback of histograms is that they break the value range up\n",
    "# into even intervals. Consequently, they don't deal well with skewed data.\n",
    "# This becomes clear when the price filter is dropped:\n",
    "# The trailing ';' hides the text repr returned by .set().\n",
    "(\n",
    "    reviews['price']\n",
    "    .plot.hist(title='Distribution of all wine prices')\n",
    "    .set(xlabel='Price', ylabel='Number of wines')\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To see how far the tail of the price distribution reaches,\n",
    "# list every wine priced above 1500:\n",
    "reviews.loc[reviews['price'] > 1500]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What percentage of wines are priced over $250?\n",
    "# Rows with a missing price never satisfy the comparison,\n",
    "# but they still count towards the total in the denominator.\n",
    "len(reviews.loc[reviews['price'] > 250]) / len(reviews) * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}