{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "Reading the file" ], "metadata": { "id": "-R6YEjd_ycHW" } }, { "cell_type": "markdown", "source": [ "`unclean_data1.csv` is not UTF-8 encoded: reading it with the default codec raises `UnicodeDecodeError` (`'utf-8' codec can't decode byte 0xff`), so it is loaded with `encoding='iso-8859-1'`." ], "metadata": { "id": "sq8V38lNyoRl" } }, { "cell_type": "code", "source": [ "import pandas as pd\n", "import numpy as np" ], "metadata": { "id": "6DzTcIw9jkvg" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df = pd.read_csv(\"unclean_data1.csv\", encoding='iso-8859-1')" ], "metadata": { "id": "0nb6y9FqGB-4" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# New section" ], "metadata": { "id": "w1TodbnDjlvS" } }, { "cell_type": "code", "source": [ "df1 = pd.read_csv(\"unclean_data2.csv\")\n", "df1.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 330 }, "id": "OToz6KGftI_i", "outputId": "a02ad415-586e-4bf8-aa5c-37fdd4d8c8b4" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 1 movie_title num_critic_for_reviews \\\n", "0 2 Avatar?ÿ 723 \n", "1 3 Pirates of the Caribbean: At World's End?ÿ 302 \n", "2 4 Spectre?ÿ 602 \n", "3 5 The Dark Knight Rises?ÿ 813 \n", "4 6 John Carter?ÿ 462 \n", "\n", " duration DIRECTOR_facebook_likes actor_3_facebook_likes \\\n", "0 178.0 10 855 \n", "1 NaN 563 1000 \n", "2 148.0 20 161 \n", "3 NaN 22000 23000 \n", "4 
132.0 \"475\" 530 \n", "\n", " ACTOR_1_facebook_likes gross num_voted_users \\\n", "0 1000 760505847 886204.0 \n", "1 40000 309404152 471220.0 \n", "2 11000 200074175 275868.0 \n", "3 27000 448130642 1144337.0 \n", "4 640 73058679 212204.0 \n", "\n", " Cast_Total_facebook_likes facenumber_in_poster num_user_for_reviews \\\n", "0 4834.0 NaN 3054 \n", "1 48350.0 NaN 1238 \n", "2 11700.0 1.0 994 \n", "3 106759.0 NaN 2701 \n", "4 1873.0 1.0 738 \n", "\n", " budget title_year ACTOR_2_facebook_likes imdb_score title_year.1 \n", "0 237000000 2009 936.0 7.9 2009.0 \n", "1 300000000 2007 5000.0 7.1 NaN \n", "2 245000000 2015 393.0 6.8 2015.0 \n", "3 250000000 2012 23000.0 8.5 NaN \n", "4 263700000 2012 632.0 6.6 NaN " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1movie_titlenum_critic_for_reviewsdurationDIRECTOR_facebook_likesactor_3_facebook_likesACTOR_1_facebook_likesgrossnum_voted_usersCast_Total_facebook_likesfacenumber_in_posternum_user_for_reviewsbudgettitle_yearACTOR_2_facebook_likesimdb_scoretitle_year.1
02Avatar?ÿ723178.0108551000760505847886204.04834.0NaN30542370000002009936.07.92009.0
13Pirates of the Caribbean: At World's End?ÿ302NaN563100040000309404152471220.048350.0NaN123830000000020075000.07.1NaN
24Spectre?ÿ602148.02016111000200074175275868.011700.01.09942450000002015393.06.82015.0
35The Dark Knight Rises?ÿ813NaN2200023000270004481306421144337.0106759.0NaN2701250000000201223000.08.5NaN
46John Carter?ÿ462132.0\"475\"53064073058679212204.01873.01.07382637000002012632.06.6NaN
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "markdown", "source": [ "Inconsistent column names\n", "\n", "Change cases\n", "Rename them" ], "metadata": { "id": "8G-qYt5ayq8_" } }, { "cell_type": "markdown", "source": [ "Change the case to upper" ], "metadata": { "id": "06uim5PWzAPG" } }, { "cell_type": "code", "source": [ "df.columns" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Znc15wC2y8CI", "outputId": "273f044c-4c3f-4792-a117-a93946ebbc6c" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['1', 'movie_title', 'num_critic_for_reviews', 'duration',\n", " 'DIRECTOR_facebook_likes', 'actor_3_facebook_likes',\n", " 'ACTOR_1_facebook_likes', 'gross', 'num_voted_users',\n", " 'Cast_Total_facebook_likes', 'facenumber_in_poster',\n", " 'num_user_for_reviews', 'budget', 'title_year',\n", " 'ACTOR_2_facebook_likes', 'imdb_score', 'title_year.1'],\n", " dtype='object')" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "source": [ "df.columns.str.upper()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "e1U1GwrFzJCu", "outputId": "8021e5e6-73e6-4ec1-f5a8-064a762a3ed5" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['1', 'MOVIE_TITLE', 'NUM_CRITIC_FOR_REVIEWS', 'DURATION',\n", " 'DIRECTOR_FACEBOOK_LIKES', 'ACTOR_3_FACEBOOK_LIKES',\n", " 'ACTOR_1_FACEBOOK_LIKES', 'GROSS', 'NUM_VOTED_USERS',\n", " 'CAST_TOTAL_FACEBOOK_LIKES', 'FACENUMBER_IN_POSTER',\n", " 'NUM_USER_FOR_REVIEWS', 'BUDGET', 'TITLE_YEAR',\n", " 'ACTOR_2_FACEBOOK_LIKES', 'IMDB_SCORE', 'TITLE_YEAR.1'],\n", " dtype='object')" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "df.columns= df.columns.str.upper()" ], "metadata": { "id": "gUNwwxNEzYyl" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df.columns" ], "metadata": { 
"colab": { "base_uri": "https://localhost:8080/" }, "id": "vOOK_EFSz5y6", "outputId": "d8dc3d84-50a8-417a-d983-252a62f0d14c" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['1', 'MOVIE_TITLE', 'NUM_CRITIC_FOR_REVIEWS', 'DURATION',\n", " 'DIRECTOR_FACEBOOK_LIKES', 'ACTOR_3_FACEBOOK_LIKES',\n", " 'ACTOR_1_FACEBOOK_LIKES', 'GROSS', 'NUM_VOTED_USERS',\n", " 'CAST_TOTAL_FACEBOOK_LIKES', 'FACENUMBER_IN_POSTER',\n", " 'NUM_USER_FOR_REVIEWS', 'BUDGET', 'TITLE_YEAR',\n", " 'ACTOR_2_FACEBOOK_LIKES', 'IMDB_SCORE', 'TITLE_YEAR.1'],\n", " dtype='object')" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "markdown", "source": [ "Rename column `1` to `S.N` (`rename` returns a new DataFrame, so the result is assigned back to `df`)" ], "metadata": { "id": "CXHjH9bu0RUn" } }, { "cell_type": "code", "source": [ "df = df.rename(columns={'1': 'S.N'})\n", "df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 890 }, "id": "MsPfypYH0UeO", "outputId": "51405ce9-09e4-4253-a532-8c87b94d930f" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " S.N MOVIE_TITLE NUM_CRITIC_FOR_REVIEWS \\\n", "0 2 Avatar?ÿ 723 \n", "1 3 Pirates of the Caribbean: At World's End?ÿ 302 \n", "2 4 Spectre?ÿ 602 \n", "3 5 The Dark Knight Rises?ÿ 813 \n", "4 6 John Carter?ÿ 462 \n", "5 7 Spider-Man 3?ÿ 392 \n", "6 8 Tangled?ÿ 324 \n", "7 9 Avengers: Age of Ultron?ÿ 635 \n", "8 10 Avengers: Age of Ultron?ÿ 635 \n", "9 11 Harry Potter and the Half-Blood Prince?ÿ 375 \n", "10 12 Batman v Superman: Dawn of Justice?ÿ 673 \n", "11 13 Superman Returns?ÿ 434 \n", "12 14 Quantum of Solace?ÿ 403 \n", "13 15 Pirates of the Caribbean: Dead Man's Chest?ÿ 313 \n", "\n", " DURATION DIRECTOR_FACEBOOK_LIKES ACTOR_3_FACEBOOK_LIKES \\\n", "0 178.0 10 855 \n", "1 NaN 563 1000 \n", "2 148.0 20 161 \n", "3 NaN 22000 23000 \n", "4 132.0 \"475\" 530 \n", "5 156.0 23 4000 \n", "6 NaN 15 284 \n", "7 141.0 10 19000 \n", "8 141.0 10 19000 \n", "9 153.0 282 10000 \n", "10 183.0 NaN 2000 \n", 
"11 169.0 NaN 903 \n", "12 106.0 395 393 \n", "13 151.0 563 1000 \n", "\n", " ACTOR_1_FACEBOOK_LIKES GROSS NUM_VOTED_USERS \\\n", "0 1000 760505847 886204.0 \n", "1 40000 309404152 471220.0 \n", "2 11000 200074175 275868.0 \n", "3 27000 448130642 1144337.0 \n", "4 640 73058679 212204.0 \n", "5 24000 336530303 383056.0 \n", "6 799 200807262 294810.0 \n", "7 26000 458991599 462669.0 \n", "8 26000 458991599 462669.0 \n", "9 25000 301956980 321795.0 \n", "10 15000 330249062 NaN \n", "11 18000 200069408 240396.0 \n", "12 451 168368427 330784.0 \n", "13 40000 423032628 522040.0 \n", "\n", " CAST_TOTAL_FACEBOOK_LIKES FACENUMBER_IN_POSTER NUM_USER_FOR_REVIEWS \\\n", "0 4834.0 NaN 3054 \n", "1 48350.0 NaN 1238 \n", "2 11700.0 1.0 994 \n", "3 106759.0 NaN 2701 \n", "4 1873.0 1.0 738 \n", "5 46055.0 NaN 1902 \n", "6 NaN 1.0 387 \n", "7 92000.0 4.0 1117 \n", "8 92000.0 4.0 1117 \n", "9 58753.0 3.0 973 \n", "10 24450.0 NaN 3018 \n", "11 NaN 2.0 2367 \n", "12 2023.0 1.0 1243 \n", "13 48486.0 2.0 1832 \n", "\n", " BUDGET TITLE_YEAR ACTOR_2_FACEBOOK_LIKES IMDB_SCORE TITLE_YEAR.1 \n", "0 237000000 2009 936.0 7.9 2009.0 \n", "1 300000000 2007 5000.0 7.1 NaN \n", "2 245000000 2015 393.0 6.8 2015.0 \n", "3 250000000 2012 23000.0 8.5 NaN \n", "4 263700000 2012 632.0 6.6 NaN \n", "5 258000000 2007 11000.0 6.2 2007.0 \n", "6 260000000 2010 553.0 7.8 NaN \n", "7 250000000 2015 21000.0 7.5 NaN \n", "8 250000000 2015 21000.0 7.5 2015.0 \n", "9 250000000 2009 11000.0 7.5 NaN \n", "10 250000000 2016 NaN 6.9 2016.0 \n", "11 209000000 2006 10000.0 6.1 NaN \n", "12 200000000 2008 412.0 6.7 2008.0 \n", "13 225000000 2006 5000.0 7.3 2008.0 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
S.NMOVIE_TITLENUM_CRITIC_FOR_REVIEWSDURATIONDIRECTOR_FACEBOOK_LIKESACTOR_3_FACEBOOK_LIKESACTOR_1_FACEBOOK_LIKESGROSSNUM_VOTED_USERSCAST_TOTAL_FACEBOOK_LIKESFACENUMBER_IN_POSTERNUM_USER_FOR_REVIEWSBUDGETTITLE_YEARACTOR_2_FACEBOOK_LIKESIMDB_SCORETITLE_YEAR.1
02Avatar?ÿ723178.0108551000760505847886204.04834.0NaN30542370000002009936.07.92009.0
13Pirates of the Caribbean: At World's End?ÿ302NaN563100040000309404152471220.048350.0NaN123830000000020075000.07.1NaN
24Spectre?ÿ602148.02016111000200074175275868.011700.01.09942450000002015393.06.82015.0
35The Dark Knight Rises?ÿ813NaN2200023000270004481306421144337.0106759.0NaN2701250000000201223000.08.5NaN
46John Carter?ÿ462132.0\"475\"53064073058679212204.01873.01.07382637000002012632.06.6NaN
57Spider-Man 3?ÿ392156.023400024000336530303383056.046055.0NaN1902258000000200711000.06.22007.0
68Tangled?ÿ324NaN15284799200807262294810.0NaN1.03872600000002010553.07.8NaN
79Avengers: Age of Ultron?ÿ635141.0101900026000458991599462669.092000.04.01117250000000201521000.07.5NaN
810Avengers: Age of Ultron?ÿ635141.0101900026000458991599462669.092000.04.01117250000000201521000.07.52015.0
911Harry Potter and the Half-Blood Prince?ÿ375153.02821000025000301956980321795.058753.03.0973250000000200911000.07.5NaN
1012Batman v Superman: Dawn of Justice?ÿ673183.0NaN200015000330249062NaN24450.0NaN30182500000002016NaN6.92016.0
1113Superman Returns?ÿ434169.0NaN90318000200069408240396.0NaN2.02367209000000200610000.06.1NaN
1214Quantum of Solace?ÿ403106.0395393451168368427330784.02023.01.012432000000002008412.06.72008.0
1315Pirates of the Caribbean: Dead Man's Chest?ÿ313151.0563100040000423032628522040.048486.02.0183222500000020065000.07.32008.0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "markdown", "source": [ "Missing values\n", "\n", "Add a default value for missing data, or fill it with the mean\n", "\n", "Delete the rows or columns with missing data\n", "\n", "Interpolate the rows\n", "\n", "Replace" ], "metadata": { "id": "KeaTGQY_2fjb" } }, { "cell_type": "markdown", "source": [ "To check for missing data" ], "metadata": { "id": "1PMEFzFs3SiG" } }, { "cell_type": "markdown", "source": [ "`df.isnull()` flags every cell: `False` means the value is present, `True` means it is missing\n", "\n", "`df.isnull().sum()` gives the number of missing values per column (int)\n", "\n", "`df.isnull().any()` tells whether each column has any missing value (bool)" ], "metadata": { "id": "fsy0U-gx3rwM" } }, { "cell_type": "code", "source": [ "df.isnull()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 508 }, "id": "sVNvH-q74TYR", "outputId": "72f15175-8478-4212-eb53-9c4ebbe5f8ed" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 1 MOVIE_TITLE NUM_CRITIC_FOR_REVIEWS DURATION \\\n", "0 False False False False \n", "1 False False False True \n", "2 False False False False \n", "3 False False False True \n", "4 False False False False \n", "5 False False False False \n", "6 False False False True \n", "7 False False False False \n", "8 False False False False \n", "9 False False False False \n", "10 False False False False \n", "11 False False False False \n", "12 False False False False \n", "13 False False False False \n", "\n", " DIRECTOR_FACEBOOK_LIKES ACTOR_3_FACEBOOK_LIKES ACTOR_1_FACEBOOK_LIKES \\\n", "0 False False False \n", "1 False False False \n", "2 False False False \n", "3 False False False \n", "4 False False False \n", "5 False False False \n", "6 False False False \n", "7 False False False \n", "8 False False False \n", "9 False False False \n", "10 True False False \n", "11 True False False \n", "12 False False False \n", "13 False False False \n", "\n", " GROSS NUM_VOTED_USERS CAST_TOTAL_FACEBOOK_LIKES FACENUMBER_IN_POSTER \\\n", "0 False False False True \n", "1 
False False False True \n", "2 False False False False \n", "3 False False False True \n", "4 False False False False \n", "5 False False False True \n", "6 False False True False \n", "7 False False False False \n", "8 False False False False \n", "9 False False False False \n", "10 False True False True \n", "11 False False True False \n", "12 False False False False \n", "13 False False False False \n", "\n", " NUM_USER_FOR_REVIEWS BUDGET TITLE_YEAR ACTOR_2_FACEBOOK_LIKES \\\n", "0 False False False False \n", "1 False False False False \n", "2 False False False False \n", "3 False False False False \n", "4 False False False False \n", "5 False False False False \n", "6 False False False False \n", "7 False False False False \n", "8 False False False False \n", "9 False False False False \n", "10 False False False True \n", "11 False False False False \n", "12 False False False False \n", "13 False False False False \n", "\n", " IMDB_SCORE TITLE_YEAR.1 \n", "0 False False \n", "1 False True \n", "2 False False \n", "3 False True \n", "4 False True \n", "5 False False \n", "6 False True \n", "7 False True \n", "8 False False \n", "9 False True \n", "10 False False \n", "11 False True \n", "12 False False \n", "13 False False " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1MOVIE_TITLENUM_CRITIC_FOR_REVIEWSDURATIONDIRECTOR_FACEBOOK_LIKESACTOR_3_FACEBOOK_LIKESACTOR_1_FACEBOOK_LIKESGROSSNUM_VOTED_USERSCAST_TOTAL_FACEBOOK_LIKESFACENUMBER_IN_POSTERNUM_USER_FOR_REVIEWSBUDGETTITLE_YEARACTOR_2_FACEBOOK_LIKESIMDB_SCORETITLE_YEAR.1
0FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalseFalseFalseFalseFalseFalse
1FalseFalseFalseTrueFalseFalseFalseFalseFalseFalseTrueFalseFalseFalseFalseFalseTrue
2FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
3FalseFalseFalseTrueFalseFalseFalseFalseFalseFalseTrueFalseFalseFalseFalseFalseTrue
4FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
5FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalseFalseFalseFalseFalseFalse
6FalseFalseFalseTrueFalseFalseFalseFalseFalseTrueFalseFalseFalseFalseFalseFalseTrue
7FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
8FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
9FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
10FalseFalseFalseFalseTrueFalseFalseFalseTrueFalseTrueFalseFalseFalseTrueFalseFalse
11FalseFalseFalseFalseTrueFalseFalseFalseFalseTrueFalseFalseFalseFalseFalseFalseTrue
12FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
13FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "df.isnull().any()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "e-sCeRAz4WhW", "outputId": "4e0c0b4d-4973-4c88-98e8-a3c850f3645f" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "1 False\n", "MOVIE_TITLE False\n", "NUM_CRITIC_FOR_REVIEWS False\n", "DURATION True\n", "DIRECTOR_FACEBOOK_LIKES True\n", "ACTOR_3_FACEBOOK_LIKES False\n", "ACTOR_1_FACEBOOK_LIKES False\n", "GROSS False\n", "NUM_VOTED_USERS True\n", "CAST_TOTAL_FACEBOOK_LIKES True\n", "FACENUMBER_IN_POSTER True\n", "NUM_USER_FOR_REVIEWS False\n", "BUDGET False\n", "TITLE_YEAR False\n", "ACTOR_2_FACEBOOK_LIKES True\n", "IMDB_SCORE False\n", "TITLE_YEAR.1 True\n", "dtype: bool" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "df.isnull().any().any()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SS_Si-Xp4lnS", "outputId": "fe6d7859-748b-4b35-a3fb-eb4e484b4635" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "markdown", "source": [ "Missing-How much value missing in each column" ], "metadata": { "id": "2-NwpcPD5T-j" } }, { "cell_type": "code", "source": [ "df.isnull().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PxOyVAyb5ET1", "outputId": "10dff513-354f-4947-eb97-e285451b4775" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "1 0\n", "MOVIE_TITLE 0\n", "NUM_CRITIC_FOR_REVIEWS 0\n", "DURATION 3\n", "DIRECTOR_FACEBOOK_LIKES 2\n", "ACTOR_3_FACEBOOK_LIKES 0\n", "ACTOR_1_FACEBOOK_LIKES 0\n", "GROSS 0\n", "NUM_VOTED_USERS 1\n", "CAST_TOTAL_FACEBOOK_LIKES 2\n", "FACENUMBER_IN_POSTER 5\n", "NUM_USER_FOR_REVIEWS 0\n", "BUDGET 0\n", "TITLE_YEAR 0\n", 
"ACTOR_2_FACEBOOK_LIKES 1\n", "IMDB_SCORE 0\n", "TITLE_YEAR.1 7\n", "dtype: int64" ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "markdown", "source": [ "Total number of missing values in the dataset" ], "metadata": { "id": "JUV5wYkm5sK4" } }, { "cell_type": "code", "source": [ "df.isnull().sum().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VQHCG0mk5eNn", "outputId": "30b1b121-6b0b-4721-e9b5-4a9bb43233da" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "21" ] }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "markdown", "source": [ "Adding a default value for the missing values - first check where the missing values (NaN) are" ], "metadata": { "id": "Bqcw7Grc6D0s" } }, { "cell_type": "code", "source": [ "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 330 }, "id": "3MwqmbcL576U", "outputId": "b8528f11-181c-40e9-c29e-500200c5fb75" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 1 MOVIE_TITLE NUM_CRITIC_FOR_REVIEWS \\\n", "0 2 Avatar?ÿ 723 \n", "1 3 Pirates of the Caribbean: At World's End?ÿ 302 \n", "2 4 Spectre?ÿ 602 \n", "3 5 The Dark Knight Rises?ÿ 813 \n", "4 6 John Carter?ÿ 462 \n", "\n", " DURATION DIRECTOR_FACEBOOK_LIKES ACTOR_3_FACEBOOK_LIKES \\\n", "0 178.0 10 855 \n", "1 NaN 563 1000 \n", "2 148.0 20 161 \n", "3 NaN 22000 23000 \n", "4 132.0 \"475\" 530 \n", "\n", " ACTOR_1_FACEBOOK_LIKES GROSS NUM_VOTED_USERS \\\n", "0 1000 760505847 886204.0 \n", "1 40000 309404152 471220.0 \n", "2 11000 200074175 275868.0 \n", "3 27000 448130642 1144337.0 \n", "4 640 73058679 212204.0 \n", "\n", " CAST_TOTAL_FACEBOOK_LIKES FACENUMBER_IN_POSTER NUM_USER_FOR_REVIEWS \\\n", "0 4834.0 NaN 3054 \n", "1 48350.0 NaN 1238 \n", "2 11700.0 1.0 994 \n", "3 106759.0 NaN 2701 \n", "4 1873.0 1.0 738 \n", "\n", " BUDGET TITLE_YEAR ACTOR_2_FACEBOOK_LIKES IMDB_SCORE TITLE_YEAR.1 \n", 
"0 237000000 2009 936.0 7.9 2009.0 \n", "1 300000000 2007 5000.0 7.1 NaN \n", "2 245000000 2015 393.0 6.8 2015.0 \n", "3 250000000 2012 23000.0 8.5 NaN \n", "4 263700000 2012 632.0 6.6 NaN " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1MOVIE_TITLENUM_CRITIC_FOR_REVIEWSDURATIONDIRECTOR_FACEBOOK_LIKESACTOR_3_FACEBOOK_LIKESACTOR_1_FACEBOOK_LIKESGROSSNUM_VOTED_USERSCAST_TOTAL_FACEBOOK_LIKESFACENUMBER_IN_POSTERNUM_USER_FOR_REVIEWSBUDGETTITLE_YEARACTOR_2_FACEBOOK_LIKESIMDB_SCORETITLE_YEAR.1
02Avatar?ÿ723178.0108551000760505847886204.04834.0NaN30542370000002009936.07.92009.0
13Pirates of the Caribbean: At World's End?ÿ302NaN563100040000309404152471220.048350.0NaN123830000000020075000.07.1NaN
24Spectre?ÿ602148.02016111000200074175275868.011700.01.09942450000002015393.06.82015.0
35The Dark Knight Rises?ÿ813NaN2200023000270004481306421144337.0106759.0NaN2701250000000201223000.08.5NaN
46John Carter?ÿ462132.0\"475\"53064073058679212204.01873.01.07382637000002012632.06.6NaN
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "df_with_0=df.fillna(0)\n" ], "metadata": { "id": "J9jQVdWd6iCm" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_with_0.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 330 }, "id": "G2rZuIsW667p", "outputId": "d46c2f52-c293-4bb2-bba3-c6c44ad25323" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 1 MOVIE_TITLE NUM_CRITIC_FOR_REVIEWS \\\n", "0 2 Avatar?ÿ 723 \n", "1 3 Pirates of the Caribbean: At World's End?ÿ 302 \n", "2 4 Spectre?ÿ 602 \n", "3 5 The Dark Knight Rises?ÿ 813 \n", "4 6 John Carter?ÿ 462 \n", "\n", " DURATION DIRECTOR_FACEBOOK_LIKES ACTOR_3_FACEBOOK_LIKES \\\n", "0 178.0 10 855 \n", "1 0.0 563 1000 \n", "2 148.0 20 161 \n", "3 0.0 22000 23000 \n", "4 132.0 \"475\" 530 \n", "\n", " ACTOR_1_FACEBOOK_LIKES GROSS NUM_VOTED_USERS \\\n", "0 1000 760505847 886204.0 \n", "1 40000 309404152 471220.0 \n", "2 11000 200074175 275868.0 \n", "3 27000 448130642 1144337.0 \n", "4 640 73058679 212204.0 \n", "\n", " CAST_TOTAL_FACEBOOK_LIKES FACENUMBER_IN_POSTER NUM_USER_FOR_REVIEWS \\\n", "0 4834.0 0.0 3054 \n", "1 48350.0 0.0 1238 \n", "2 11700.0 1.0 994 \n", "3 106759.0 0.0 2701 \n", "4 1873.0 1.0 738 \n", "\n", " BUDGET TITLE_YEAR ACTOR_2_FACEBOOK_LIKES IMDB_SCORE TITLE_YEAR.1 \n", "0 237000000 2009 936.0 7.9 2009.0 \n", "1 300000000 2007 5000.0 7.1 0.0 \n", "2 245000000 2015 393.0 6.8 2015.0 \n", "3 250000000 2012 23000.0 8.5 0.0 \n", "4 263700000 2012 632.0 6.6 0.0 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1MOVIE_TITLENUM_CRITIC_FOR_REVIEWSDURATIONDIRECTOR_FACEBOOK_LIKESACTOR_3_FACEBOOK_LIKESACTOR_1_FACEBOOK_LIKESGROSSNUM_VOTED_USERSCAST_TOTAL_FACEBOOK_LIKESFACENUMBER_IN_POSTERNUM_USER_FOR_REVIEWSBUDGETTITLE_YEARACTOR_2_FACEBOOK_LIKESIMDB_SCORETITLE_YEAR.1
02Avatar?ÿ723178.0108551000760505847886204.04834.00.030542370000002009936.07.92009.0
13Pirates of the Caribbean: At World's End?ÿ3020.0563100040000309404152471220.048350.00.0123830000000020075000.07.10.0
24Spectre?ÿ602148.02016111000200074175275868.011700.01.09942450000002015393.06.82015.0
35The Dark Knight Rises?ÿ8130.02200023000270004481306421144337.0106759.00.02701250000000201223000.08.50.0
46John Carter?ÿ462132.0\"475\"53064073058679212204.01873.01.07382637000002012632.06.60.0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "code", "source": [ "df['DURATION']\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "r7S5MHDM7EHh", "outputId": "42a88d81-bd11-417b-f4f8-354406ced212" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 178.0\n", "1 NaN\n", "2 148.0\n", "3 NaN\n", "4 132.0\n", "5 156.0\n", "6 NaN\n", "7 141.0\n", "8 141.0\n", "9 153.0\n", "10 183.0\n", "11 169.0\n", "12 106.0\n", "13 151.0\n", "Name: DURATION, dtype: float64" ] }, "metadata": {}, "execution_count": 23 } ] }, { "cell_type": "code", "source": [ "df['DURATION'].mean()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tSXOaVhG7ipP", "outputId": "c2092f3b-61e5-4fdc-a7d0-d0ce6d2dc3bb" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "150.72727272727272" ] }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "source": [ "df_with_mean=df.DURATION.fillna(df['DURATION'].mean())" ], "metadata": { "id": "cEc9d0_77wWh" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_with_mean" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "57lTenrl8br5", "outputId": "9f3824f0-42aa-4e8b-b5d9-0e4e8117d2cf" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 178.000000\n", "1 150.727273\n", "2 148.000000\n", "3 150.727273\n", "4 132.000000\n", "5 156.000000\n", "6 150.727273\n", "7 141.000000\n", "8 141.000000\n", "9 153.000000\n", "10 183.000000\n", "11 169.000000\n", "12 106.000000\n", "13 151.000000\n", "Name: DURATION, dtype: float64" ] }, "metadata": {}, "execution_count": 29 } ] }, { "cell_type": "markdown", "source": [ "Drop NA" ], "metadata": { "id": "tOHs43TH9k_2" } }, { "cell_type": "code", "source": [ "df.isnull().sum().sum()" ], "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" }, "id": "BGsZ6CYR8eqb", "outputId": "0623b132-3fcf-40f0-d276-29533b05bc86" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "21" ] }, "metadata": {}, "execution_count": 30 } ] }, { "cell_type": "code", "source": [ "df.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FjCJ-teR9_1q", "outputId": "f1e456c8-4339-483f-8c41-085ce9a97e3f" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(14, 17)" ] }, "metadata": {}, "execution_count": 31 } ] }, { "cell_type": "code", "source": [ "df_drop = df.dropna()" ], "metadata": { "id": "TVVUTyek-Dyh" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_drop.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "82AJpoXj-dYA", "outputId": "f755775e-2981-4f0c-c321-3210c79902b9" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(4, 17)" ] }, "metadata": {}, "execution_count": 34 } ] }, { "cell_type": "code", "source": [ "df_drop" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 299 }, "id": "vhKmZmV5_IJX", "outputId": "3e99e62f-a96d-4c1c-ea52-c1c4d334eba1" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 1 MOVIE_TITLE NUM_CRITIC_FOR_REVIEWS \\\n", "2 4 Spectre?ÿ 602 \n", "8 10 Avengers: Age of Ultron?ÿ 635 \n", "12 14 Quantum of Solace?ÿ 403 \n", "13 15 Pirates of the Caribbean: Dead Man's Chest?ÿ 313 \n", "\n", " DURATION DIRECTOR_FACEBOOK_LIKES ACTOR_3_FACEBOOK_LIKES \\\n", "2 148.0 20 161 \n", "8 141.0 10 19000 \n", "12 106.0 395 393 \n", "13 151.0 563 1000 \n", "\n", " ACTOR_1_FACEBOOK_LIKES GROSS NUM_VOTED_USERS \\\n", "2 11000 200074175 275868.0 \n", "8 26000 458991599 462669.0 \n", "12 451 168368427 330784.0 \n", "13 40000 423032628 522040.0 \n", "\n", " 
CAST_TOTAL_FACEBOOK_LIKES FACENUMBER_IN_POSTER NUM_USER_FOR_REVIEWS \\\n", "2 11700.0 1.0 994 \n", "8 92000.0 4.0 1117 \n", "12 2023.0 1.0 1243 \n", "13 48486.0 2.0 1832 \n", "\n", " BUDGET TITLE_YEAR ACTOR_2_FACEBOOK_LIKES IMDB_SCORE TITLE_YEAR.1 \n", "2 245000000 2015 393.0 6.8 2015.0 \n", "8 250000000 2015 21000.0 7.5 2015.0 \n", "12 200000000 2008 412.0 6.7 2008.0 \n", "13 225000000 2006 5000.0 7.3 2008.0 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1MOVIE_TITLENUM_CRITIC_FOR_REVIEWSDURATIONDIRECTOR_FACEBOOK_LIKESACTOR_3_FACEBOOK_LIKESACTOR_1_FACEBOOK_LIKESGROSSNUM_VOTED_USERSCAST_TOTAL_FACEBOOK_LIKESFACENUMBER_IN_POSTERNUM_USER_FOR_REVIEWSBUDGETTITLE_YEARACTOR_2_FACEBOOK_LIKESIMDB_SCORETITLE_YEAR.1
24Spectre?ÿ602148.02016111000200074175275868.011700.01.09942450000002015393.06.82015.0
810Avengers: Age of Ultron?ÿ635141.0101900026000458991599462669.092000.04.01117250000000201521000.07.52015.0
1214Quantum of Solace?ÿ403106.0395393451168368427330784.02023.01.012432000000002008412.06.72008.0
1315Pirates of the Caribbean: Dead Man's Chest?ÿ313151.0563100040000423032628522040.048486.02.0183222500000020065000.07.32008.0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 35 } ] }, { "cell_type": "code", "source": [ "df_drop_with_condition = df.dropna(how=\"any\")" ], "metadata": { "id": "rImnEqiV-gRI" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_drop_with_condition\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 299 }, "id": "8oB93czkLPXO", "outputId": "2b94717e-fa9b-429a-94e6-2700a2753945" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 1 MOVIE_TITLE NUM_CRITIC_FOR_REVIEWS \\\n", "2 4 Spectre?ÿ 602 \n", "8 10 Avengers: Age of Ultron?ÿ 635 \n", "12 14 Quantum of Solace?ÿ 403 \n", "13 15 Pirates of the Caribbean: Dead Man's Chest?ÿ 313 \n", "\n", " DURATION DIRECTOR_FACEBOOK_LIKES ACTOR_3_FACEBOOK_LIKES \\\n", "2 148.0 20 161 \n", "8 141.0 10 19000 \n", "12 106.0 395 393 \n", "13 151.0 563 1000 \n", "\n", " ACTOR_1_FACEBOOK_LIKES GROSS NUM_VOTED_USERS \\\n", "2 11000 200074175 275868.0 \n", "8 26000 458991599 462669.0 \n", "12 451 168368427 330784.0 \n", "13 40000 423032628 522040.0 \n", "\n", " CAST_TOTAL_FACEBOOK_LIKES FACENUMBER_IN_POSTER NUM_USER_FOR_REVIEWS \\\n", "2 11700.0 1.0 994 \n", "8 92000.0 4.0 1117 \n", "12 2023.0 1.0 1243 \n", "13 48486.0 2.0 1832 \n", "\n", " BUDGET TITLE_YEAR ACTOR_2_FACEBOOK_LIKES IMDB_SCORE TITLE_YEAR.1 \n", "2 245000000 2015 393.0 6.8 2015.0 \n", "8 250000000 2015 21000.0 7.5 2015.0 \n", "12 200000000 2008 412.0 6.7 2008.0 \n", "13 225000000 2006 5000.0 7.3 2008.0 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1MOVIE_TITLENUM_CRITIC_FOR_REVIEWSDURATIONDIRECTOR_FACEBOOK_LIKESACTOR_3_FACEBOOK_LIKESACTOR_1_FACEBOOK_LIKESGROSSNUM_VOTED_USERSCAST_TOTAL_FACEBOOK_LIKESFACENUMBER_IN_POSTERNUM_USER_FOR_REVIEWSBUDGETTITLE_YEARACTOR_2_FACEBOOK_LIKESIMDB_SCORETITLE_YEAR.1
24Spectre?ÿ602148.02016111000200074175275868.011700.01.09942450000002015393.06.82015.0
810Avengers: Age of Ultron?ÿ635141.0101900026000458991599462669.092000.04.01117250000000201521000.07.52015.0
1214Quantum of Solace?ÿ403106.0395393451168368427330784.02023.01.012432000000002008412.06.72008.0
1315Pirates of the Caribbean: Dead Man's Chest?ÿ313151.0563100040000423032628522040.048486.02.0183222500000020065000.07.32008.0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 39 } ] }, { "cell_type": "code", "source": [ "df.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0Uz7j5hEQG8H", "outputId": "9408c7af-2fa9-4ea9-864f-48963bb0ea6c" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(14, 17)" ] }, "metadata": {}, "execution_count": 40 } ] }, { "cell_type": "code", "source": [ "df_drop_column = df.dropna(axis=1)" ], "metadata": { "id": "Y14yOzYnQaEi" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_drop_column.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GVBFRJSvQmMS", "outputId": "32334d89-96f4-4905-a6e2-881a3c15d270" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(14, 10)" ] }, "metadata": {}, "execution_count": 42 } ] }, { "cell_type": "markdown", "source": [ "Dropping Duplicate\n", "\n", "drop_duplicates\n", "\n", "keep=\"first\"" ], "metadata": { "id": "JmG-3JH--7io" } }, { "cell_type": "code", "source": [ "df=pd.read_csv(\"unclean_data2.csv\")" ], "metadata": { "id": "UKa82pjsRAfo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df.head(10)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 626 }, "id": "fIZJmDDTRNEK", "outputId": "9f7b0eae-8bb0-4961-b8fa-ca85ce6126b0" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 1 movie_title num_critic_for_reviews \\\n", "0 2 Avatar?ÿ 723 \n", "1 3 Pirates of the Caribbean: At World's End?ÿ 302 \n", "2 4 Spectre?ÿ 602 \n", "3 5 The Dark Knight Rises?ÿ 813 \n", "4 6 John Carter?ÿ 462 \n", "5 7 Spider-Man 3?ÿ 392 \n", "6 8 Tangled?ÿ 324 \n", "7 9 Avengers: Age of Ultron?ÿ 635 \n", "8 10 Avengers: Age of Ultron?ÿ 635 \n", "9 11 Harry Potter and the Half-Blood Prince?ÿ 375 \n", "\n", " duration DIRECTOR_facebook_likes 
actor_3_facebook_likes \\\n", "0 178.0 10 855 \n", "1 NaN 563 1000 \n", "2 148.0 20 161 \n", "3 NaN 22000 23000 \n", "4 132.0 \"475\" 530 \n", "5 156.0 23 4000 \n", "6 NaN 15 284 \n", "7 141.0 10 19000 \n", "8 141.0 10 19000 \n", "9 153.0 282 10000 \n", "\n", " ACTOR_1_facebook_likes gross num_voted_users \\\n", "0 1000 760505847 886204.0 \n", "1 40000 309404152 471220.0 \n", "2 11000 200074175 275868.0 \n", "3 27000 448130642 1144337.0 \n", "4 640 73058679 212204.0 \n", "5 24000 336530303 383056.0 \n", "6 799 200807262 294810.0 \n", "7 26000 458991599 462669.0 \n", "8 26000 458991599 462669.0 \n", "9 25000 301956980 321795.0 \n", "\n", " Cast_Total_facebook_likes facenumber_in_poster num_user_for_reviews \\\n", "0 4834.0 NaN 3054 \n", "1 48350.0 NaN 1238 \n", "2 11700.0 1.0 994 \n", "3 106759.0 NaN 2701 \n", "4 1873.0 1.0 738 \n", "5 46055.0 NaN 1902 \n", "6 NaN 1.0 387 \n", "7 92000.0 4.0 1117 \n", "8 92000.0 4.0 1117 \n", "9 58753.0 3.0 973 \n", "\n", " budget title_year ACTOR_2_facebook_likes imdb_score title_year.1 \n", "0 237000000 2009 936.0 7.9 2009.0 \n", "1 300000000 2007 5000.0 7.1 NaN \n", "2 245000000 2015 393.0 6.8 2015.0 \n", "3 250000000 2012 23000.0 8.5 NaN \n", "4 263700000 2012 632.0 6.6 NaN \n", "5 258000000 2007 11000.0 6.2 2007.0 \n", "6 260000000 2010 553.0 7.8 NaN \n", "7 250000000 2015 21000.0 7.5 NaN \n", "8 250000000 2015 21000.0 7.5 2015.0 \n", "9 250000000 2009 11000.0 7.5 NaN " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1movie_titlenum_critic_for_reviewsdurationDIRECTOR_facebook_likesactor_3_facebook_likesACTOR_1_facebook_likesgrossnum_voted_usersCast_Total_facebook_likesfacenumber_in_posternum_user_for_reviewsbudgettitle_yearACTOR_2_facebook_likesimdb_scoretitle_year.1
02Avatar?ÿ723178.0108551000760505847886204.04834.0NaN30542370000002009936.07.92009.0
13Pirates of the Caribbean: At World's End?ÿ302NaN563100040000309404152471220.048350.0NaN123830000000020075000.07.1NaN
24Spectre?ÿ602148.02016111000200074175275868.011700.01.09942450000002015393.06.82015.0
35The Dark Knight Rises?ÿ813NaN2200023000270004481306421144337.0106759.0NaN2701250000000201223000.08.5NaN
46John Carter?ÿ462132.0\"475\"53064073058679212204.01873.01.07382637000002012632.06.6NaN
57Spider-Man 3?ÿ392156.023400024000336530303383056.046055.0NaN1902258000000200711000.06.22007.0
68Tangled?ÿ324NaN15284799200807262294810.0NaN1.03872600000002010553.07.8NaN
79Avengers: Age of Ultron?ÿ635141.0101900026000458991599462669.092000.04.01117250000000201521000.07.5NaN
810Avengers: Age of Ultron?ÿ635141.0101900026000458991599462669.092000.04.01117250000000201521000.07.52015.0
911Harry Potter and the Half-Blood Prince?ÿ375153.02821000025000301956980321795.058753.03.0973250000000200911000.07.5NaN
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 50 } ] }, { "cell_type": "code", "source": [ "df.duplicated()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "R84h3tk4Rkxx", "outputId": "af8867a8-319b-4969-fbd0-fef25736e8f1" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 False\n", "6 False\n", "7 False\n", "8 False\n", "9 False\n", "10 False\n", "11 False\n", "12 False\n", "13 False\n", "dtype: bool" ] }, "metadata": {}, "execution_count": 51 } ] }, { "cell_type": "code", "source": [ "df.duplicated('movie_title')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ph6AsIEISMeu", "outputId": "d6fbbb44-767b-4768-8055-9e9864ed1df2" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 False\n", "6 False\n", "7 False\n", "8 True\n", "9 False\n", "10 False\n", "11 False\n", "12 False\n", "13 False\n", "dtype: bool" ] }, "metadata": {}, "execution_count": 53 } ] }, { "cell_type": "code", "source": [ "df.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9xS4OBoHSp76", "outputId": "dd8c1b05-f50c-4e2c-a411-f92e9bf7c20f" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(14, 17)" ] }, "metadata": {}, "execution_count": 54 } ] }, { "cell_type": "code", "source": [ "df_drop_dup = df.drop_duplicates()" ], "metadata": { "id": "LGzm7EouTm4u" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_drop_dup.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TnhSEBnGT2lN", "outputId": "cdee2eb7-0e8f-459c-a2d0-9c3a9cc11baa" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(14, 17)" ] }, "metadata": {}, 
"execution_count": 57 } ] }, { "cell_type": "code", "source": [ "df_drop_dup = df.drop_duplicates('movie_title')" ], "metadata": { "id": "bCSlP_p_UHif" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_drop_dup.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "R9loEfUcUQH2", "outputId": "be5e13c6-7327-44b3-cf36-bccb7f69d912" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(13, 17)" ] }, "metadata": {}, "execution_count": 59 } ] }, { "cell_type": "markdown", "source": [ "Data Type Inconsistencies\n", "\n", "Change datatype after reading csv file\n", "\n", "Change datatype before reading the csv ++ pd.read_csv(url,dtype={'column1':float})" ], "metadata": { "id": "aGm7xMyfUbge" } }, { "cell_type": "code", "source": [ "df=pd.read_csv('unclean_data.csv',dtype={'column1':float})" ], "metadata": { "id": "SN8x9hDxUTVE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df.dtypes" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0tGlg_NxV9Ok", "outputId": "a0dc50ba-d6d0-40ae-c625-2d23b28f58b3" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "1 int64\n", "movie_title object\n", "num_critic_for_reviews int64\n", "duration float64\n", "DIRECTOR_facebook_likes object\n", "actor_3_facebook_likes int64\n", "ACTOR_1_facebook_likes int64\n", "gross int64\n", "num_voted_users float64\n", "Cast_Total_facebook_likes float64\n", "facenumber_in_poster float64\n", "num_user_for_reviews int64\n", "budget int64\n", "title_year int64\n", "ACTOR_2_facebook_likes float64\n", "imdb_score float64\n", "title_year.1 float64\n", "dtype: object" ] }, "metadata": {}, "execution_count": 64 } ] }, { "cell_type": "code", "source": [ "df.gross.dtypes" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Xf3kW8J6WGMG", "outputId": 
import numpy as np
from scipy.stats import norm


def hypothesis_test(sample, pop_mean, alpha=0.05, two_tailed=True):
    """Run a one-sample z-test of H0: the population mean equals ``pop_mean``.

    The standard error is estimated from the sample itself (sample standard
    deviation with ``ddof=1``) and the statistic is referred to the standard
    normal distribution.  NOTE(review): with an estimated standard deviation
    the normal reference is a large-sample approximation; for small samples
    (such as n = 20 in the demo below) a Student-t test is usually the more
    appropriate choice -- confirm which one the analysis intends.

    Parameters
    ----------
    sample : array_like
        Observations; converted to a flat float array, so lists, NumPy arrays
        and numeric pandas Series are all accepted.
    pop_mean : float
        Population mean stated by the null hypothesis.
    alpha : float, default 0.05
        Significance level that the p-value is compared against.
    two_tailed : bool, default True
        If True, compute a two-tailed p-value.  If False, compute a
        one-tailed p-value for the tail on the side where ``z`` falls.

    Returns
    -------
    tuple
        ``(z, p_value, result)`` where ``result`` is ``"reject"`` when
        ``p_value < alpha`` and ``"fail to reject"`` otherwise.

    Raises
    ------
    ValueError
        If the sample has fewer than two observations, contains NaN/inf, or
        has zero variance; in all of these cases the statistic is undefined
        and would otherwise silently come out as NaN or +/-inf.
    """
    # Flatten once so the observation count and the moments always refer to
    # the same set of values (len() only counts the first axis).
    values = np.asarray(sample, dtype=float).ravel()
    n = values.size
    if n < 2:
        raise ValueError(
            "hypothesis_test needs at least 2 observations, got %d" % n
        )

    sample_mean = values.mean()
    # ddof=1 divides by n - 1 (sample standard deviation); NumPy's default
    # ddof=0 divides by n (population standard deviation).
    sample_std = values.std(ddof=1)
    if not np.isfinite(sample_std) or sample_std == 0:
        raise ValueError(
            "sample standard deviation must be finite and non-zero "
            "(drop or impute missing values first)"
        )

    # Test statistic: distance of the sample mean from the hypothesised mean,
    # measured in standard errors.
    z = (sample_mean - pop_mean) / (sample_std / np.sqrt(n))

    # norm.sf(x) is the survival function 1 - cdf(x), evaluated without the
    # cancellation that makes `1 - norm.cdf(x)` collapse to exactly 0.0 for
    # large x.  By symmetry of the normal distribution, the tail area on the
    # side where z falls is sf(|z|) for both positive and negative z.
    tail_area = norm.sf(abs(z))
    p_value = 2 * tail_area if two_tailed else tail_area

    # Reject H0 only when the p-value is strictly below the significance level.
    result = "reject" if p_value < alpha else "fail to reject"

    return z, p_value, result


# Demo: draw a reproducible sample and test H0 "population mean == 5.0".
# np.random.normal(loc, scale, size) draws `size` values from a normal
# (Gaussian) distribution with mean `loc` and standard deviation `scale`;
# with size=None (the default) a single value is returned.
ALPHA = 0.05
np.random.seed(0)
sample = np.random.normal(loc=4.5, scale=2, size=20)
pop_mean = 5.0

z, p_value, result = hypothesis_test(sample, pop_mean, alpha=ALPHA)

print(f"Test statistic: {z:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Result: {result} null hypothesis at alpha={ALPHA}")
"https://localhost:8080/" }, "id": "7GHEfUKUTiSZ", "outputId": "b377b2ce-5fd0-4502-c154-f9f577bf7a5c" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Test statistic: 1.6372\n", "P-value: 0.1016\n", "Result: fail to reject null hypothesis at alpha=0.05\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "Ph3DVeO3T_xg" }, "execution_count": null, "outputs": [] } ] }