125 lines
3.6 KiB
Python
125 lines
3.6 KiB
Python
import os, json, boto3
|
|
from typing import List
|
|
|
|
ec2_attributes = {
|
|
"KeyName":"joshua-IAM-keypair",
|
|
"InstanceProfile":"EMR_EC2_DefaultRole",
|
|
"SubnetId":"subnet-0034e615b047fd112",
|
|
"EmrManagedSlaveSecurityGroup":"sg-08e546ae27d86d6a3",
|
|
"EmrManagedMasterSecurityGroup":"sg-08e546ae27d86d6a3"
|
|
}
|
|
|
|
bootstrap_actions = [
|
|
{
|
|
"Path":"s3://insightde/emr/bootstraps/distroduce.sh",
|
|
"Name":"bootstrap"
|
|
}
|
|
]
|
|
|
|
instance_groups = [
|
|
{
|
|
"InstanceCount":5,
|
|
"EbsConfiguration":
|
|
{"EbsBlockDeviceConfigs":
|
|
[
|
|
{
|
|
"VolumeSpecification":
|
|
{"SizeInGB":32,"VolumeType":"gp2"},
|
|
"VolumesPerInstance":2
|
|
}
|
|
]
|
|
},
|
|
"InstanceGroupType":"CORE",
|
|
"InstanceType":"m4.xlarge",
|
|
"Name":"Core - 2"
|
|
},
|
|
{
|
|
"InstanceCount":1,
|
|
"EbsConfiguration":
|
|
{
|
|
"EbsBlockDeviceConfigs":
|
|
[
|
|
{
|
|
"VolumeSpecification":
|
|
{"SizeInGB":32,"VolumeType":"gp2"},
|
|
"VolumesPerInstance":2
|
|
}
|
|
]
|
|
},
|
|
"InstanceGroupType":"MASTER",
|
|
"InstanceType":"m4.xlarge",
|
|
"Name":"Master - 1"
|
|
}
|
|
]
|
|
|
|
configurations = [
|
|
{
|
|
"Classification":"spark-env",
|
|
"Properties":{},
|
|
"Configurations":
|
|
[
|
|
{
|
|
"Classification":"export",
|
|
"Properties":{
|
|
"PYSPARK_PYTHON": "/usr/bin/python3",
|
|
"PYSPARK_DRIVER_PYTHON": "/usr/bin/python3"
|
|
}
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"Classification":"spark-defaults",
|
|
"Properties":{
|
|
"spark.sql.execution.arrow.enabled": "true"
|
|
}
|
|
},
|
|
{
|
|
"Classification":"spark",
|
|
"Properties":{
|
|
"maximizeResourceAllocation":"true"
|
|
}
|
|
}
|
|
]
|
|
|
|
|
|
|
|
def create_distroduce_cluster(name, region, ec2_attributes, bootstrap_actions, instance_groups, configurations):
|
|
def log_uri(name, region):
|
|
return f's3n://{name}-{region}/elasticmapreduce/'
|
|
|
|
os.system(f"""
|
|
aws emr create-cluster \
|
|
--applications Name=Hadoop Name=Hive Name=Spark \
|
|
--ec2-attributes '{json.dumps(ec2_attributes)}' \
|
|
--release-label emr-5.26.0 \
|
|
--log-uri '{str(log_uri(name, region))}' \
|
|
--instance-groups '{json.dumps(instance_groups)}' \
|
|
--configurations '{json.dumps(configurations)}' \
|
|
--auto-scaling-role EMR_AutoScaling_DefaultRole \
|
|
--bootstrap-actions '{json.dumps(bootstrap_actions)}' \
|
|
--ebs-root-volume-size 10 \
|
|
--service-role EMR_DefaultRole \
|
|
--enable-debugging \
|
|
--name '{name}' \
|
|
--scale-down-behavior TERMINATE_AT_TASK_COMPLETION \
|
|
--region {region}
|
|
""")
|
|
|
|
|
|
def benchmark(names: List[str], region, ec2_attributes, bootstrap_actions, instance_groups, configurations):
|
|
current_dir = os.path.dirname(__file__)
|
|
s3 = boto3.client('s3')
|
|
bucket = 'insightde'
|
|
|
|
file = 'distroduce.sh'
|
|
abs_path = os.path.join(current_dir, file)
|
|
key = f'emr/bootstraps/{file}'
|
|
|
|
s3.upload_file(abs_path, bucket, key)
|
|
for name in names:
|
|
create_distroduce_cluster(name, region, ec2_attributes, bootstrap_actions, instance_groups, configurations)
|
|
|
|
|
|
name = 'distibuted_produce'
|
|
region = 'us-east-1'
|
|
benchmark([name], region, ec2_attributes, bootstrap_actions, instance_groups, configurations) |