{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's work on visualizations!\n",
    "# The following is based on a Kaggle tutorial on visualization.\n",
    "# We will use the pandas plotting tools.\n",
    "# Here we will only discuss \"univariate\" visualizations.\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# This dataset contains descriptions and reviews of 150K different wines.\n",
    "# It contains 10 columns of wine reviews.\n",
    "# The goal is to analyze this data to predict good wines from these reviews.\n",
    "reviews = pd.read_csv(\"../Data/wine-reviews/winemag-data_first150k.csv\", index_col=0)\n",
    "reviews.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The top 10 wine-producing regions of the world,\n",
    "# ranked by the number of reviewed wines in this dataset\n",
    "reviews['province'].value_counts().head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# I bet you did not know that California is about 4 times bigger than\n",
    "# any other region in the world in number of wines reviewed here.\n",
    "# Let's do our first plot.\n",
    "# This one is a BAR-PLOT, perfect for categorical/nominal scales.\n",
    "ax = reviews['province'].value_counts().head(10).plot.bar()\n",
    "# The trailing ';' hides the repr of the value returned by ax.set().\n",
    "ax.set(title='Top 10 provinces by number of reviewed wines',\n",
    "       xlabel='Province', ylabel='Number of wines');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's redo the above bar plot, but with fractions of all reviewed wines\n",
    "# (multiply by 100 to read the bars as percentages).\n",
    "ax = (reviews['province'].value_counts().head(10) / len(reviews)).plot.bar()\n",
    "ax.set(title='Top 10 provinces by share of reviewed wines',\n",
    "       xlabel='Province', ylabel='Fraction of all wines');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now, let's do a distribution of \"points\" ratings for all the wines.\n",
    "# Points ratings for the listed wines range from 80-100.\n",
    "# This is a BAR-PLOT, since you have one bar for every possible value.\n",
    "# In HISTOGRAMS, values are grouped into ranges.\n",
    "ax = reviews['points'].value_counts().sort_index().plot.bar()\n",
    "ax.set(title='Number of wines per points rating (bar plot)',\n",
    "       xlabel='Points', ylabel='Number of wines');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The same counts per \"points\" value, drawn as a LINE-PLOT\n",
    "ax = reviews['points'].value_counts().sort_index().plot.line()\n",
    "ax.set(title='Number of wines per points rating (line plot)',\n",
    "       xlabel='Points', ylabel='Number of wines');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# And once more as an AREA-PLOT (a line plot with the area underneath filled in)\n",
    "ax = reviews['points'].value_counts().sort_index().plot.area()\n",
    "ax.set(title='Number of wines per points rating (area plot)',\n",
    "       xlabel='Points', ylabel='Number of wines');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Histograms are bar-plots for interval ranges.\n",
    "# The following plot looks at the \"price\" column\n",
    "# and counts the number of wines within price ranges.\n",
    "# Rows with a missing price compare as False and are left out.\n",
    "ax = reviews.loc[reviews['price'] < 200, 'price'].plot.hist()\n",
    "ax.set(title='Distribution of wine prices below 200',\n",
    "       xlabel='Price', ylabel='Number of wines');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A major drawback of histograms is that they break space up into even intervals.\n",
    "# Consequently, they don't deal very well with skewed data.\n",
    "# This becomes clear when you plot every price, without the cut-off used above:\n",
    "ax = reviews['price'].plot.hist()\n",
    "ax.set(title='Distribution of all wine prices',\n",
    "       xlabel='Price', ylabel='Number of wines');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# In fact, the distribution is so skewed that it is worth listing\n",
    "# the wines priced above $1500:\n",
    "reviews[reviews['price'] > 1500]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What percentage of wines are priced over $250?\n",
    "# The mean of a boolean mask is the fraction of True rows; wines with a\n",
    "# missing price compare as False but still count in the denominator.\n",
    "(reviews['price'] > 250).mean() * 100"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}