  • .gitignore
    skipped 5 lines
     **/__pycache__/
     **/score.json
     **/result.json
    +nb/data
    +
    +
  • boss.Dockerfile
    +FROM p5-base
    +
    +CMD [ "bash", "-c", "/spark-3.5.5-bin-hadoop3/sbin/start-master.sh && sleep infinity"]
    +
  • nb/p5.ipynb
    +{
    + "cells": [
    +  {
    +   "cell_type": "code",
    +   "execution_count": 1,
    +   "id": "92f14f0d-75ca-4565-acc3-2dfc461a09fc",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      "Setting default log level to \"WARN\".\n",
    +      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
    +      "25/03/28 22:59:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
    +     ]
    +    }
    +   ],
    +   "source": [
    +    "from pyspark.sql import SparkSession\n",
    +    "spark = (SparkSession.builder.appName(\"cs544\")\n",
    +    "         .master(\"spark://boss:7077\")\n",
    +    "         .config(\"spark.executor.memory\", \"1G\")\n",
    +    "         .config(\"spark.sql.warehouse.dir\", \"hdfs://nn:9000/user/hive/warehouse\")\n",
    +    "         .enableHiveSupport()\n",
    +    "         .getOrCreate())"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 2,
    +   "id": "115945dd-e62c-4cd4-a51e-5706c0a9d082",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stdout",
    +     "output_type": "stream",
    +     "text": [
    +      "cp: `hdfs://nn:9000/problems.jsonl': File exists\n"
    +     ]
    +    }
    +   ],
    +   "source": [
    +    "!hdfs dfs -cp data/problems.jsonl hdfs://nn:9000/problems.jsonl"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 3,
    +   "id": "904fd924-22a3-4bca-9579-40ae91f58cf1",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      " "
    +     ]
    +    }
    +   ],
    +   "source": [
    +    "df = (spark.read.format(\"json\")\n",
    +    "      .load(\"hdfs://nn:9000/problems.jsonl\"))"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 4,
    +   "id": "739f0dde-73c4-48db-aed5-8bc79c01b4d0",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      "[Stage 1:> (0 + 1) / 1]"
    +     ]
    +    },
    +    {
    +     "name": "stdout",
    +     "output_type": "stream",
    +     "text": [
    +      "+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
    +      "|cf_contest_id|cf_index|cf_points|cf_rating|        cf_tags|difficulty|generated_tests|is_description_translated|memory_limit_bytes|                name|private_tests|problem_id|public_tests|source|time_limit|\n",
    +      "+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
    +      "|          322|       A|    500.0|     1000|            [0]|         7|             93|                    false|         256000000|322_A. Ciel and D...|           45|         1|           2|     2|         1|\n",
    +      "|          760|       D|   1000.0|     1600|         [1, 2]|        10|             51|                    false|         256000000|  760_D. Travel Card|            4|         2|           2|     2|         2|\n",
    +      "|          569|       E|   1500.0|     2600|         [3, 0]|        11|             99|                    false|         256000000| 569_E. New Language|           17|         3|           3|     2|         2|\n",
    +      "|          447|       B|   1000.0|     1000|         [0, 4]|         8|            100|                    false|         256000000|447_B. DZY Loves ...|           13|         4|           1|     2|         1|\n",
    +      "|         1292|       B|    750.0|     1700|[5, 6, 7, 0, 4]|         8|             91|                    false|         256000000|1292_B. Aroma's S...|          131|         5|           3|     2|         1|\n",
    +      "+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
    +      "\n"
    +     ]
    +    },
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      " "
    +     ]
    +    }
    +   ],
    +   "source": [
    +    "df.limit(5).show()"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 5,
    +   "id": "d9075280-aab5-4681-9d08-e3ec41b04ea8",
    +   "metadata": {},
    +   "outputs": [],
    +   "source": [
    +    "df.createOrReplaceTempView(\"problems\")"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 6,
    +   "id": "9a0eb6f5-7240-496c-8773-033462bede5e",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      " "
    +     ]
    +    },
    +    {
    +     "data": {
    +      "text/plain": [
    +       "217"
    +      ]
    +     },
    +     "execution_count": 6,
    +     "metadata": {},
    +     "output_type": "execute_result"
    +    }
    +   ],
    +   "source": [
    +    "#q1\n",
    +    "spark.table(\"problems\").rdd.filter(\n",
    +    "    lambda row: row.cf_rating >= 1600 and row.private_tests > 0 and \"_A.\" in row.name\n",
    +    ").count()"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 7,
    +   "id": "78ed631a-5f73-495a-b673-dde6b8dcba02",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "data": {
    +      "text/plain": [
    +       "217"
    +      ]
    +     },
    +     "execution_count": 7,
    +     "metadata": {},
    +     "output_type": "execute_result"
    +    }
    +   ],
    +   "source": [
    +    "#q2\n",
    +    "from pyspark.sql.functions import expr, col\n",
    +    "\n",
    +    "(\n",
    +    "    spark.table(\"problems\")\n",
    +    "    .filter(expr(\"cf_rating >= 1600\"))\n",
    +    "    .filter(expr(\"private_tests > 0\"))\n",
    +    "    .filter(col(\"name\").contains(\"_A.\"))\n",
    +    "    .count()\n",
    +    ")"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 8,
    +   "id": "cce715e4-227f-4d68-94ea-86233a9f072d",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "data": {
    +      "text/plain": [
    +       "217"
    +      ]
    +     },
    +     "execution_count": 8,
    +     "metadata": {},
    +     "output_type": "execute_result"
    +    }
    +   ],
    +   "source": [
    +    "#q3\n",
    +    "spark.sql(\"\"\"\n",
    +    "    SELECT COUNT(*)\n",
    +    "    FROM problems\n",
    +    "    WHERE cf_rating >= 1600\n",
    +    "      AND private_tests > 0\n",
    +    "      AND name LIKE '%_A.%'\n",
    +    "\"\"\").collect()[0][0]"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": null,
    +   "id": "d22eaf4c-4801-49c1-ae01-6af4fa2f06af",
    +   "metadata": {},
    +   "outputs": [],
    +   "source": []
    +  }
    + ],
    + "metadata": {
    +  "kernelspec": {
    +   "display_name": "Python 3 (ipykernel)",
    +   "language": "python",
    +   "name": "python3"
    +  },
    +  "language_info": {
    +   "codemirror_mode": {
    +    "name": "ipython",
    +    "version": 3
    +   },
    +   "file_extension": ".py",
    +   "mimetype": "text/x-python",
    +   "name": "python",
    +   "nbconvert_exporter": "python",
    +   "pygments_lexer": "ipython3",
    +   "version": "3.10.12"
    +  }
    + },
    + "nbformat": 4,
    + "nbformat_minor": 5
    +}
    +
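  Aside on cell 2 above: the `cp: ... File exists` output appears because `hdfs dfs -cp` refuses to overwrite an existing destination, so the cell errors on re-runs. A minimal idempotent sketch, reusing the notebook's paths (`-f` is the standard overwrite flag for `hdfs dfs -cp`, and `hdfs dfs -test -e` checks for existence):

    # Force-overwrite the destination so the copy succeeds on re-runs.
    hdfs dfs -cp -f data/problems.jsonl hdfs://nn:9000/problems.jsonl

    # Or skip the copy entirely when the file is already in HDFS.
    hdfs dfs -test -e hdfs://nn:9000/problems.jsonl || \
        hdfs dfs -cp data/problems.jsonl hdfs://nn:9000/problems.jsonl
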
  • worker.Dockerfile
    +FROM p5-base
    +
    +CMD [ "bash", "-c", "/spark-3.5.5-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 2 -m 2g && sleep infinity"]
    +
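
  For context, a sketch of how these two images might be built and run together; the network name, container names, and the prebuilt p5-base image are assumptions, not part of this change. On a user-defined bridge network, the worker reaches the master by the container name `boss`, matching `spark://boss:7077` in the CMDs, and the HDFS namenode would likewise need to be reachable as `nn` for the notebook's paths:

    # Build master and worker images on top of the (assumed) p5-base image.
    docker build -f boss.Dockerfile -t p5-boss .
    docker build -f worker.Dockerfile -t p5-worker .

    # One master plus two workers on a shared user-defined network, so the
    # containers can resolve each other by name.
    docker network create p5net
    docker run -d --name boss --network p5net p5-boss
    docker run -d --name w1 --network p5net p5-worker
    docker run -d --name w2 --network p5net p5-worker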