{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "92f14f0d-75ca-4565-acc3-2dfc461a09fc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"25/03/28 22:59:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"spark = (SparkSession.builder.appName(\"cs544\")\n",
" .master(\"spark://boss:7077\")\n",
" .config(\"spark.executor.memory\", \"1G\")\n",
" .config(\"spark.sql.warehouse.dir\", \"hdfs://nn:9000/user/hive/warehouse\")\n",
" .enableHiveSupport()\n",
" .getOrCreate())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "115945dd-e62c-4cd4-a51e-5706c0a9d082",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cp: `hdfs://nn:9000/problems.jsonl': File exists\n"
]
}
],
"source": [
"!hdfs dfs -cp -f data/problems.jsonl hdfs://nn:9000/problems.jsonl"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "904fd924-22a3-4bca-9579-40ae91f58cf1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" "
]
}
],
"source": [
"df = (spark.read.format(\"json\")\n",
" .load(\"hdfs://nn:9000/problems.jsonl\"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "739f0dde-73c4-48db-aed5-8bc79c01b4d0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Stage 1:> (0 + 1) / 1]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
"|cf_contest_id|cf_index|cf_points|cf_rating| cf_tags|difficulty|generated_tests|is_description_translated|memory_limit_bytes| name|private_tests|problem_id|public_tests|source|time_limit|\n",
"+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
"| 322| A| 500.0| 1000| [0]| 7| 93| false| 256000000|322_A. Ciel and D...| 45| 1| 2| 2| 1|\n",
"| 760| D| 1000.0| 1600| [1, 2]| 10| 51| false| 256000000| 760_D. Travel Card| 4| 2| 2| 2| 2|\n",
"| 569| E| 1500.0| 2600| [3, 0]| 11| 99| false| 256000000| 569_E. New Language| 17| 3| 3| 2| 2|\n",
"| 447| B| 1000.0| 1000| [0, 4]| 8| 100| false| 256000000|447_B. DZY Loves ...| 13| 4| 1| 2| 1|\n",
"| 1292| B| 750.0| 1700|[5, 6, 7, 0, 4]| 8| 91| false| 256000000|1292_B. Aroma's S...| 131| 5| 3| 2| 1|\n",
"+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" "
]
}
],
"source": [
"df.limit(5).show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d9075280-aab5-4681-9d08-e3ec41b04ea8",
"metadata": {},
"outputs": [],
"source": [
"df.createOrReplaceTempView(\"problems\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9a0eb6f5-7240-496c-8773-033462bede5e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" "
]
},
{
"data": {
"text/plain": [
"217"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#q1\n",
"spark.table(\"problems\").rdd.filter(\n",
" lambda row: row.cf_rating >= 1600 and row.private_tests > 0 and \"_A.\" in row.name\n",
").count()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "78ed631a-5f73-495a-b673-dde6b8dcba02",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"217"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#q2\n",
"from pyspark.sql.functions import expr, col\n",
"\n",
"(\n",
" spark.table(\"problems\")\n",
" .filter(expr(\"cf_rating >= 1600\"))\n",
" .filter(expr(\"private_tests > 0\"))\n",
" .filter(col(\"name\").contains(\"_A.\"))\n",
" .count()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cce715e4-227f-4d68-94ea-86233a9f072d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"217"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#q3\n",
"spark.sql(\"\"\"\n",
" SELECT COUNT(*)\n",
" FROM problems\n",
" WHERE cf_rating >= 1600\n",
" AND private_tests > 0\n",
"    AND instr(name, '_A.') > 0\n",
"\"\"\").collect()[0][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d22eaf4c-4801-49c1-ae01-6af4fa2f06af",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}