  • .gitignore
    skipped 5 lines
     **/__pycache__/
     **/score.json
     **/result.json
    +nb/data
    +
    +
  • boss.Dockerfile
    +FROM p5-base
    +
    +CMD [ "bash", "-c", "/spark-3.5.5-bin-hadoop3/sbin/start-master.sh && sleep infinity"]
    +
  • nb/p5.ipynb
    +{
    + "cells": [
    +  {
    +   "cell_type": "code",
    +   "execution_count": 1,
    +   "id": "92f14f0d-75ca-4565-acc3-2dfc461a09fc",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      "Setting default log level to \"WARN\".\n",
    +      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
    +      "25/03/28 22:59:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
    +     ]
    +    }
    +   ],
    +   "source": [
    +    "from pyspark.sql import SparkSession\n",
    +    "spark = (SparkSession.builder.appName(\"cs544\")\n",
    +    "         .master(\"spark://boss:7077\")\n",
    +    "         .config(\"spark.executor.memory\", \"1G\")\n",
    +    "         .config(\"spark.sql.warehouse.dir\", \"hdfs://nn:9000/user/hive/warehouse\")\n",
    +    "         .enableHiveSupport()\n",
    +    "         .getOrCreate())"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 2,
    +   "id": "115945dd-e62c-4cd4-a51e-5706c0a9d082",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stdout",
    +     "output_type": "stream",
    +     "text": [
    +      "cp: `hdfs://nn:9000/problems.jsonl': File exists\n"
    +     ]
    +    }
    +   ],
    +   "source": [
    +    "!hdfs dfs -cp data/problems.jsonl hdfs://nn:9000/problems.jsonl"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 3,
    +   "id": "904fd924-22a3-4bca-9579-40ae91f58cf1",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      " "
    +     ]
    +    }
    +   ],
    +   "source": [
    +    "df = (spark.read.format(\"json\")\n",
    +    "      .load(\"hdfs://nn:9000/problems.jsonl\"))"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 4,
    +   "id": "739f0dde-73c4-48db-aed5-8bc79c01b4d0",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      "[Stage 1:> (0 + 1) / 1]"
    +     ]
    +    },
    +    {
    +     "name": "stdout",
    +     "output_type": "stream",
    +     "text": [
    +      "+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
    +      "|cf_contest_id|cf_index|cf_points|cf_rating|        cf_tags|difficulty|generated_tests|is_description_translated|memory_limit_bytes|                name|private_tests|problem_id|public_tests|source|time_limit|\n",
    +      "+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
    +      "|          322|       A|    500.0|     1000|            [0]|         7|             93|                    false|         256000000|322_A. Ciel and D...|           45|         1|           2|     2|         1|\n",
    +      "|          760|       D|   1000.0|     1600|         [1, 2]|        10|             51|                    false|         256000000|  760_D. Travel Card|            4|         2|           2|     2|         2|\n",
    +      "|          569|       E|   1500.0|     2600|         [3, 0]|        11|             99|                    false|         256000000| 569_E. New Language|           17|         3|           3|     2|         2|\n",
    +      "|          447|       B|   1000.0|     1000|         [0, 4]|         8|            100|                    false|         256000000|447_B. DZY Loves ...|           13|         4|           1|     2|         1|\n",
    +      "|         1292|       B|    750.0|     1700|[5, 6, 7, 0, 4]|         8|             91|                    false|         256000000|1292_B. Aroma's S...|          131|         5|           3|     2|         1|\n",
    +      "+-------------+--------+---------+---------+---------------+----------+---------------+-------------------------+------------------+--------------------+-------------+----------+------------+------+----------+\n",
    +      "\n"
    +     ]
    +    },
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      " "
    +     ]
    +    }
    +   ],
    +   "source": [
    +    "df.limit(5).show()"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 5,
    +   "id": "d9075280-aab5-4681-9d08-e3ec41b04ea8",
    +   "metadata": {},
    +   "outputs": [],
    +   "source": [
    +    "df.createOrReplaceTempView(\"problems\")"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 6,
    +   "id": "9a0eb6f5-7240-496c-8773-033462bede5e",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "name": "stderr",
    +     "output_type": "stream",
    +     "text": [
    +      " "
    +     ]
    +    },
    +    {
    +     "data": {
    +      "text/plain": [
    +       "217"
    +      ]
    +     },
    +     "execution_count": 6,
    +     "metadata": {},
    +     "output_type": "execute_result"
    +    }
    +   ],
    +   "source": [
    +    "#q1\n",
    +    "spark.table(\"problems\").rdd.filter(\n",
    +    "    lambda row: row.cf_rating >= 1600 and row.private_tests > 0 and \"_A.\" in row.name\n",
    +    ").count()"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 7,
    +   "id": "78ed631a-5f73-495a-b673-dde6b8dcba02",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "data": {
    +      "text/plain": [
    +       "217"
    +      ]
    +     },
    +     "execution_count": 7,
    +     "metadata": {},
    +     "output_type": "execute_result"
    +    }
    +   ],
    +   "source": [
    +    "#q2\n",
    +    "from pyspark.sql.functions import expr, col\n",
    +    "\n",
    +    "(\n",
    +    "    spark.table(\"problems\")\n",
    +    "    .filter(expr(\"cf_rating >= 1600\"))\n",
    +    "    .filter(expr(\"private_tests > 0\"))\n",
    +    "    .filter(col(\"name\").contains(\"_A.\"))\n",
    +    "    .count()\n",
    +    ")"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": 8,
    +   "id": "cce715e4-227f-4d68-94ea-86233a9f072d",
    +   "metadata": {},
    +   "outputs": [
    +    {
    +     "data": {
    +      "text/plain": [
    +       "217"
    +      ]
    +     },
    +     "execution_count": 8,
    +     "metadata": {},
    +     "output_type": "execute_result"
    +    }
    +   ],
    +   "source": [
    +    "#q3\n",
    +    "spark.sql(\"\"\"\n",
    +    "    SELECT COUNT(*)\n",
    +    "    FROM problems\n",
    +    "    WHERE cf_rating >= 1600\n",
    +    "      AND private_tests > 0\n",
    +    "      AND name LIKE '%_A.%'\n",
    +    "\"\"\").collect()[0][0]"
    +   ]
    +  },
    +  {
    +   "cell_type": "code",
    +   "execution_count": null,
    +   "id": "d22eaf4c-4801-49c1-ae01-6af4fa2f06af",
    +   "metadata": {},
    +   "outputs": [],
    +   "source": []
    +  }
    + ],
    + "metadata": {
    +  "kernelspec": {
    +   "display_name": "Python 3 (ipykernel)",
    +   "language": "python",
    +   "name": "python3"
    +  },
    +  "language_info": {
    +   "codemirror_mode": {
    +    "name": "ipython",
    +    "version": 3
    +   },
    +   "file_extension": ".py",
    +   "mimetype": "text/x-python",
    +   "name": "python",
    +   "nbconvert_exporter": "python",
    +   "pygments_lexer": "ipython3",
    +   "version": "3.10.12"
    +  }
    + },
    + "nbformat": 4,
    + "nbformat_minor": 5
    +}
    +
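  Aside on cell 2 above: the `cp: ... File exists` output appears because `hdfs dfs -cp` refuses to overwrite an existing destination, so the cell errors on re-runs. A minimal idempotent sketch, reusing the notebook's paths (`-f` is the standard overwrite flag for `hdfs dfs -cp`, and `hdfs dfs -test -e` checks for existence):

    # Force-overwrite the destination so the copy succeeds on re-runs.
    hdfs dfs -cp -f data/problems.jsonl hdfs://nn:9000/problems.jsonl

    # Or skip the copy entirely when the file is already in HDFS.
    hdfs dfs -test -e hdfs://nn:9000/problems.jsonl || \
        hdfs dfs -cp data/problems.jsonl hdfs://nn:9000/problems.jsonl
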
  • worker.Dockerfile
    +FROM p5-base
    +
    +CMD [ "bash", "-c", "/spark-3.5.5-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 2 -m 2g && sleep infinity"]
    +
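
  For context, a sketch of how these two images might be built and run together; the network name, container names, and the prebuilt p5-base image are assumptions, not part of this change. On a user-defined bridge network, the worker reaches the master by the container name `boss`, matching `spark://boss:7077` in the CMDs, and the HDFS namenode would likewise need to be reachable as `nn` for the notebook's paths:

    # Build master and worker images on top of the (assumed) p5-base image.
    docker build -f boss.Dockerfile -t p5-boss .
    docker build -f worker.Dockerfile -t p5-worker .

    # One master plus two workers on a shared user-defined network, so the
    # containers can resolve each other by name.
    docker network create p5net
    docker run -d --name boss --network p5net p5-boss
    docker run -d --name w1 --network p5net p5-worker
    docker run -d --name w2 --network p5net p5-worker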