{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "92f14f0d-75ca-4565-acc3-2dfc461a09fc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"25/03/28 22:59:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"spark = (SparkSession.builder.appName(\"cs544\")\n",
" .master(\"spark://boss:7077\")\n",
" .config(\"spark.executor.memory\", \"1G\")\n",
" .config(\"spark.sql.warehouse.dir\", \"hdfs://nn:9000/user/hive/warehouse\")\n",
" .enableHiveSupport()\n",
" .getOrCreate())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "115945dd-e62c-4cd4-a51e-5706c0a9d082",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cp: `hdfs://nn:9000/problems.jsonl': File exists\n"
]
}
],
"source": [
"!hdfs dfs -cp -f data/problems.jsonl hdfs://nn:9000/problems.jsonl"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "904fd924-22a3-4bca-9579-40ae91f58cf1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" "
]
}
],
"source": [
"df = (spark.read.format(\"json\")\n",
" .load(\"hdfs://nn:9000/problems.jsonl\"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "739f0dde-73c4-48db-aed5-8bc79c01b4d0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Stage 1:> (0 + 1) / 1]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
"|cf_contest_id|cf_index|cf_points|cf_rating| cf_tags|difficulty|generated_tests|is_description_translated|memory_limit_bytes| name|private_tests|problem_id|public_tests|source|time_limit|\n",
"+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
"| 322| A| 500.0| 1000| [0]| 7| 93| false| 256000000|322_A. Ciel and D...| 45| 1| 2| 2| 1|\n",
"| 760| D| 1000.0| 1600| [1, 2]| 10| 51| false| 256000000| 760_D. Travel Card| 4| 2| 2| 2| 2|\n",
"| 569| E| 1500.0| 2600| [3, 0]| 11| 99| false| 256000000| 569_E. New Language| 17| 3| 3| 2| 2|\n",
"| 447| B| 1000.0| 1000| [0, 4]| 8| 100| false| 256000000|447_B. DZY Loves ...| 13| 4| 1| 2| 1|\n",
"| 1292| B| 750.0| 1700|[5, 6, 7, 0, 4]| 8| 91| false| 256000000|1292_B. Aroma's S...| 131| 5| 3| 2| 1|\n",
"+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" "
]
}
],
"source": [
"df.limit(5).show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d9075280-aab5-4681-9d08-e3ec41b04ea8",
"metadata": {},
"outputs": [],
"source": [
"df.createOrReplaceTempView(\"problems\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9a0eb6f5-7240-496c-8773-033462bede5e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" "
]
},
{
"data": {
"text/plain": [
"217"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#q1\n",
"spark.table(\"problems\").rdd.filter(\n",
" lambda row: row.cf_rating >= 1600 and row.private_tests > 0 and \"_A.\" in row.name\n",
").count()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "78ed631a-5f73-495a-b673-dde6b8dcba02",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"217"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#q2\n",
"from pyspark.sql.functions import expr, col\n",
"\n",
"(\n",
" spark.table(\"problems\")\n",
" .filter(expr(\"cf_rating >= 1600\"))\n",
" .filter(expr(\"private_tests > 0\"))\n",
" .filter(col(\"name\").contains(\"_A.\"))\n",
" .count()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cce715e4-227f-4d68-94ea-86233a9f072d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"217"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#q3\n",
"spark.sql(\"\"\"\n",
" SELECT COUNT(*)\n",
" FROM problems\n",
" WHERE cf_rating >= 1600\n",
" AND private_tests > 0\n",
"    AND instr(name, '_A.') > 0\n",
"\"\"\").collect()[0][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d22eaf4c-4801-49c1-ae01-6af4fa2f06af",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}