diff --git a/distroduce/bash/launch_cluster.sh b/distroduce/bash/launch_cluster.sh deleted file mode 100644 index c49de60..0000000 --- a/distroduce/bash/launch_cluster.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -aws emr create-cluster --auto-scaling-role EMR_AutoScaling_DefaultRole \ ---applications Name=Hadoop Name=Hive Name=Spark \ ---bootstrap-actions '[{"Path":"s3://insightde/emr/bootstraps/bootstrap.sh","Name":"bootstrap"}]' \ ---ebs-root-volume-size 10 \ ---ec2-attributes '{"KeyName":"joshua-IAM-keypair","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-0034e615b047fd112","EmrManagedSlaveSecurityGroup":"sg-08e546ae27d86d6a3","EmrManagedMasterSecurityGroup":"sg-08e546ae27d86d6a3"}' \ ---service-role EMR_DefaultRole --release-label emr-5.26.0 --name 'test_cluster12d' \ ---instance-groups '[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","InstanceType":"m4.xlarge","Name":"Master - 1"},{"InstanceCount":5,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"CORE","InstanceType":"m4.xlarge","Name":"Core - 2"}]' --configurations '[{"Classification":"spark-env","Properties":{},"Configurations":[{"Classification":"export","Properties":{}}]},{"Classification":"spark-defaults","Properties":{}},{"Classification":"spark","Properties":{"maximizeResourceAllocation":"true"}}]' \ ---scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region us-east-1 \ No newline at end of file diff --git a/distroduce/bash/spark.json b/distroduce/bash/spark.json deleted file mode 100644 index affb901..0000000 --- a/distroduce/bash/spark.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "Classification": "spark-env", - "Configurations": [ - { - "Classification": "export", - "ConfigurationProperties": { - "PYSPARK_PYTHON": "/usr/bin/python3", - "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3" - } - } - ] - }, - { - "Classification": "spark-defaults", - "ConfigurationProperties": { - "spark.sql.execution.arrow.enabled": "true" - } - }, - { - "Classification": "spark", - "Properties": { - "maximizeResourceAllocation": "true" - } - } -] diff --git a/distroduce/bash/build_source.sh b/distroduce/configuration/build_source.sh similarity index 100% rename from distroduce/bash/build_source.sh rename to distroduce/configuration/build_source.sh diff --git a/distroduce/bash/distroduce.sh b/distroduce/configuration/distroduce.sh similarity index 99% rename from distroduce/bash/distroduce.sh rename to distroduce/configuration/distroduce.sh index 8cd762a..9034c0a 100644 --- a/distroduce/bash/distroduce.sh +++ b/distroduce/configuration/distroduce.sh @@ -5,7 +5,6 @@ yes | sudo python3 -m pip install pathos kafka-python wget https://raw.githubusercontent.com/JEJodesty/cadCAD/dev/dist/cadCAD-0.0.2-py3-none-any.whl yes | sudo python3 -m pip install cadCAD-0.0.2-py3-none-any.whl - # check for master node IS_MASTER=false if grep -i isMaster /mnt/var/lib/info/instance.json | grep -i true; diff --git a/distroduce/bash/gen_bootstrap.sh b/distroduce/configuration/gen_bootstrap.sh similarity index 100% rename from distroduce/bash/gen_bootstrap.sh rename to distroduce/configuration/gen_bootstrap.sh diff --git a/distroduce/configuration/launch_cluster.py b/distroduce/configuration/launch_cluster.py new file mode 100644 index 0000000..7a8371c --- /dev/null +++ b/distroduce/configuration/launch_cluster.py @@ -0,0 +1,125 @@ +import os, json, boto3 +from typing import List + +ec2_attributes = { + "KeyName":"joshua-IAM-keypair", + "InstanceProfile":"EMR_EC2_DefaultRole", + "SubnetId":"subnet-0034e615b047fd112", + "EmrManagedSlaveSecurityGroup":"sg-08e546ae27d86d6a3", + "EmrManagedMasterSecurityGroup":"sg-08e546ae27d86d6a3" +} + +bootstrap_actions = [ + { + "Path":"s3://insightde/emr/bootstraps/distroduce.sh", + "Name":"bootstrap" + } +] + +instance_groups = [ + { + "InstanceCount":5, + "EbsConfiguration": + {"EbsBlockDeviceConfigs": + [ + { + "VolumeSpecification": + {"SizeInGB":32,"VolumeType":"gp2"}, + "VolumesPerInstance":2 + } + ] + }, + "InstanceGroupType":"CORE", + "InstanceType":"m4.xlarge", + "Name":"Core - 2" + }, + { + "InstanceCount":1, + "EbsConfiguration": + { + "EbsBlockDeviceConfigs": + [ + { + "VolumeSpecification": + {"SizeInGB":32,"VolumeType":"gp2"}, + "VolumesPerInstance":2 + } + ] + }, + "InstanceGroupType":"MASTER", + "InstanceType":"m4.xlarge", + "Name":"Master - 1" + } +] + +configurations = [ + { + "Classification":"spark-env", + "Properties":{}, + "Configurations": + [ + { + "Classification":"export", + "Properties":{ + "PYSPARK_PYTHON": "/usr/bin/python3", + "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3" + } + } + ] + }, + { + "Classification":"spark-defaults", + "Properties":{ + "spark.sql.execution.arrow.enabled": "true" + } + }, + { + "Classification":"spark", + "Properties":{ + "maximizeResourceAllocation":"true" + } + } +] + + + +def create_distroduce_cluster(name, region, ec2_attributes, bootstrap_actions, instance_groups, configurations): + def log_uri(name, region): + return f's3n://{name}-{region}/elasticmapreduce/' + + os.system(f""" + aws emr create-cluster \ + --applications Name=Hadoop Name=Hive Name=Spark \ + --ec2-attributes '{json.dumps(ec2_attributes)}' \ + --release-label emr-5.26.0 \ + --log-uri '{str(log_uri(name, region))}' \ + --instance-groups '{json.dumps(instance_groups)}' \ + --configurations '{json.dumps(configurations)}' \ + --auto-scaling-role EMR_AutoScaling_DefaultRole \ + --bootstrap-actions '{json.dumps(bootstrap_actions)}' \ + --ebs-root-volume-size 10 \ + --service-role EMR_DefaultRole \ + --enable-debugging \ + --name '{name}' \ + --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ + --region {region} + """) + + +def benchmark(names: List[str], region, ec2_attributes, bootstrap_actions, instance_groups, configurations): + current_dir = os.path.dirname(__file__) + s3 = boto3.client('s3') + bucket = 'insightde' + + file = 'distroduce.sh' + abs_path = os.path.join(current_dir, file) + key = f'emr/bootstraps/{file}' + + s3.upload_file(abs_path, bucket, key) + for name in names: + create_distroduce_cluster(name, region, ec2_attributes, bootstrap_actions, instance_groups, configurations) + + +name = 'distibuted_produce' +region = 'us-east-1' +benchmark([name], region, ec2_attributes, bootstrap_actions, instance_groups, configurations) \ No newline at end of file diff --git a/distroduce/configuration/launch_cluster.sh b/distroduce/configuration/launch_cluster.sh new file mode 100644 index 0000000..0233fbf --- /dev/null +++ b/distroduce/configuration/launch_cluster.sh @@ -0,0 +1,16 @@ +#!/bin/bash +aws emr create-cluster \ +--applications Name=Hadoop Name=Hive Name=Spark \ +--ec2-attributes '{"KeyName":"joshua-IAM-keypair","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-0034e615b047fd112","EmrManagedSlaveSecurityGroup":"sg-08e546ae27d86d6a3","EmrManagedMasterSecurityGroup":"sg-08e546ae27d86d6a3"}' \ +--release-label emr-5.26.0 \ +--log-uri 's3n://aws-logs-251682129355-us-east-1/elasticmapreduce/' \ +--instance-groups '[{"InstanceCount":5,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"CORE","InstanceType":"m4.xlarge","Name":"Core - 2"},{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","InstanceType":"m4.xlarge","Name":"Master - 1"}]' \ +--configurations '[{"Classification":"spark-env","Properties":{},"Configurations":[{"Classification":"export","Properties":{}}]},{"Classification":"spark-defaults","Properties":{}},{"Classification":"spark","Properties":{"maximizeResourceAllocation":"true"}}]' \ +--auto-scaling-role EMR_AutoScaling_DefaultRole \ +--bootstrap-actions '[{"Path":"s3://insightde/emr/bootstraps/distroduce.sh","Name":"bootstrap"}]' \ +--ebs-root-volume-size 10 \ +--service-role EMR_DefaultRole \ +--enable-debugging \ +--name 'distibuted_produce' \ +--scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ +--region us-east-1 \ No newline at end of file diff --git a/distroduce/bash/run.sh b/distroduce/configuration/run.sh similarity index 100% rename from distroduce/bash/run.sh rename to distroduce/configuration/run.sh diff --git a/distroduce/bash/spark_submit.sh b/distroduce/configuration/spark_submit.sh similarity index 50% rename from distroduce/bash/spark_submit.sh rename to distroduce/configuration/spark_submit.sh index c72713f..99cec53 100644 --- a/distroduce/bash/spark_submit.sh +++ b/distroduce/configuration/spark_submit.sh @@ -1,4 +1,3 @@ #!/bin/bash PRIVATE_IP=`hostname -I | xargs` -sudo sed -i -e '$a\export PYSPARK_PYTHON=/usr/bin/python3' /etc/spark/conf/spark-env.sh spark-submit --master yarn --py-files distroduce.zip messaging_sim.py $PRIVATE_IP \ No newline at end of file diff --git a/distroduce/bash/toEC2.sh b/distroduce/configuration/toEC2.sh similarity index 99% rename from distroduce/bash/toEC2.sh rename to distroduce/configuration/toEC2.sh index da3ca27..48f4d57 100644 --- a/distroduce/bash/toEC2.sh +++ b/distroduce/configuration/toEC2.sh @@ -1,5 +1,4 @@ #!/bin/bash - scp -i ~/.ssh/joshua-IAM-keypair.pem ~/Projects/event-bench/event_bench.py \ hadoop@ec2-3-230-158-62.compute-1.amazonaws.com:/home/hadoop/