From dcd3e005102b841b5d098ee743a37bde0d7efce4 Mon Sep 17 00:00:00 2001
From: Michael Kamprath
Date: Sun, 27 Oct 2019 10:07:03 -0700
Subject: [PATCH] added example jupyter notebook

---
Reviewer note: isPrime's trial-division loop now runs while i*i <= val
(was i*i < val). With the strict bound the divisor i was never tried when
val == i*i, so 25, 121, 289, ... were reported as prime. The blob id on
the notebook's index line predates this one-character edit.

 .../prime-numbers/prime-numbers.ipynb         | 113 ++++++++++++++++++
 simple-spark-swarm/deploy-spark-swarm.yml     |   2 +-
 2 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100644 jupyter-notebooks/prime-numbers/prime-numbers.ipynb

diff --git a/jupyter-notebooks/prime-numbers/prime-numbers.ipynb b/jupyter-notebooks/prime-numbers/prime-numbers.ipynb
new file mode 100644
index 0000000..477340c
--- /dev/null
+++ b/jupyter-notebooks/prime-numbers/prime-numbers.ipynb
@@ -0,0 +1,113 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pyspark.sql.functions as F\n",
+    "import pyspark.sql.types as T\n",
+    "\n",
+    "spark = SparkSession\\\n",
+    "    .builder\\\n",
+    "    .appName(\"CalculatePrimeNumbers\")\\\n",
+    "    .getOrCreate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MAX_VALUE = 1000000000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Algorithm reference:\n",
+    "#\thttps://en.wikipedia.org/wiki/Primality_test\n",
+    "\n",
+    "def isPrime(val):\n",
+    "    if val <= 3:\n",
+    "        return val > 1\n",
+    "    elif val%2 == 0 or val%3 == 0:\n",
+    "        return False\n",
+    "    else:\n",
+    "        i = 5\n",
+    "        while i*i <= val:\n",
+    "            if val%i == 0 or val%(i + 2) == 0:\n",
+    "                return False\n",
+    "            i += 6\n",
+    "        return True\n",
+    "\n",
+    "values = spark.sparkContext.parallelize(\n",
+    "        range(1,MAX_VALUE+1), \n",
+    "        2000\n",
+    "    ).map(\n",
+    "        lambda x: (x, isPrime(x))\n",
+    "    ).toDF().withColumnRenamed('_1', 'value').withColumnRenamed('_2', 'is_prime').cache()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "values.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "values.filter(F.col('is_prime')).count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "values.filter(F.col('is_prime')).agg(F.max('value')).collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/simple-spark-swarm/deploy-spark-swarm.yml b/simple-spark-swarm/deploy-spark-swarm.yml
index 517c50c..066485e 100644
--- a/simple-spark-swarm/deploy-spark-swarm.yml
+++ b/simple-spark-swarm/deploy-spark-swarm.yml
@@ -60,7 +60,7 @@ services:
       replicas: 4
       resources:
         limits:
-          cpus: "8.0"
+          cpus: "6.0"
           memory: 52g
 
   spark-jupyter: