EMR Config for a Big Cluster to Create Many Partitions in Hive
EMR Configuration:
https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
[
  {
    "Classification": "hive-site",
    "Properties": {
      "hive.fetch.task.conversion": "none",
      "hive.support.concurrency": "true",
      "hive.exec.parallel": "true",
      "hive.stats.autogather": "false",
      "hive.execution.engine": "tez",
      "hive.tez.container.size": "60240",
      "hive.tez.java.opts": "-Xmx48192m",
      "hive.exec.dynamic.partition.mode": "nonstrict",
      "hive.strict.checks.cartesian.product": "false",
      "hive.exec.max.dynamic.partitions.pernode": "9000000",
      "hive.exec.max.dynamic.partitions": "9000000",
      "hive.exec.max.created.files": "10000000",
      "hive.optimize.sort.dynamic.partition": "true"
    }
  },
  {
    "Classification": "tez-site",
    "Properties": {
      "tez.session.am.dag.submit.timeout.secs": "600",
      "tez.am.grouping.max-size": "134217728",
      "tez.am.resource.memory.mb": "60240",
      "tez.session.client.timeout.secs": "3600",
      "tez.task.resource.memory.mb": "60240",
      "tez.am.container.reuse.enabled": "true",
      "tez.am.java.opts": "-server -Xmx48192m -Djava.net.preferIPv4Stack=true"
    }
  },
  {
    "Classification": "yarn-site",
    "Properties": {
      "yarn.resourcemanager.scheduler.class": "org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler",
      "yarn.nodemanager.recovery.enabled": "true"
    }
  },
  {
    "Classification": "hdfs-site",
    "Properties": {
      "dfs.replication": "1"
    }
  },
  {
    "Classification": "hadoop-env",
    "Properties": {},
    "Configurations": [
      {
        "Classification": "export",
        "Properties": {
          "HADOOP_NAMENODE_HEAPSIZE": "10096",
          "HADOOP_DATANODE_HEAPSIZE": "6048"
        },
        "Configurations": []
      }
    ]
  },
  {
    "Classification": "hive-env",
    "Properties": {},
    "Configurations": [
      {
        "Classification": "export",
        "Properties": {
          "HADOOP_HEAPSIZE": "20480"
        },
        "Configurations": []
      }
    ]
  }
]
Create Cluster CLI Command:
https://docs.aws.amazon.com/cli/latest/reference/emr/create-cluster.html
In the command below, search for <code>replace</code> and substitute each occurrence with your own values, such as your security groups, subnet, SSH key, IAM roles, and EMR log bucket.
1 |
# Launch an EMR 5.27.0 cluster (1 master + 5 core c5.9xlarge nodes, plus an
# empty m5.xlarge task group) with Hive on Tez configured for very high
# dynamic-partition limits.
#
# Every value containing "replace" (key name, subnet, security groups, log
# bucket) is a placeholder and must be substituted before running.
#
# The whole invocation is one command: each line ends in a backslash so the
# shell passes the JSON that follows --configurations as its argument.
# hdfs-site uses dfs.replication (the HDFS replication-factor property).
aws emr create-cluster \
  --termination-protected \
  --applications Name=Hadoop Name=Hive Name=Pig Name=Hue Name=Spark \
  --ec2-attributes '{"KeyName":"replace.pem","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-replace","EmrManagedSlaveSecurityGroup":"sg-replace","EmrManagedMasterSecurityGroup":"sg-replace"}' \
  --release-label emr-5.27.0 \
  --log-uri 's3n://replace/elasticmapreduce/' \
  --instance-groups '[{"InstanceCount":5,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"CORE","InstanceType":"c5.9xlarge","Name":"Core - 2"},{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","InstanceType":"c5.9xlarge","Name":"Master - 1"},{"InstanceCount":0,"BidPrice":"OnDemandPrice","EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"TASK","InstanceType":"m5.xlarge","Name":"Task"}]' \
  --configurations '[{"Classification":"hive-site","Properties":{"hive.exec.max.dynamic.partitions":"9000000","hive.support.concurrency":"true","hive.tez.container.size":"60240","hive.strict.checks.cartesian.product":"false","hive.lock.manager":"org.apache.hadoop.hive.ql.lockmgr.EmbeddedLockManager","hive.exec.parallel":"true","hive.exec.dynamic.partition.mode":"nonstrict","hive.tez.java.opts":"-Xmx48192m","hive.exec.max.dynamic.partitions.pernode":"9000000","hive.fetch.task.conversion":"none","hive.exec.max.created.files":"10000000","hive.stats.autogather":"false","hive.execution.engine":"tez","hive.optimize.sort.dynamic.partition":"true"}},{"Classification":"tez-site","Properties":{"tez.session.am.dag.submit.timeout.secs":"600","tez.am.log.level":"DEBUG","tez.task.resource.memory.mb":"60240","tez.am.grouping.max-size":"134217728","tez.am.resource.memory.mb":"60240","tez.session.client.timeout.secs":"3600","tez.am.container.reuse.enabled":"true","tez.am.java.opts":"-server -Xmx48192m -Djava.net.preferIPv4Stack=true"}},{"Classification":"yarn-site","Properties":{"yarn.resourcemanager.scheduler.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler","yarn.nodemanager.recovery.enabled":"true"}},{"Classification":"hdfs-site","Properties":{"dfs.replication":"1"}}]' \
  --auto-scaling-role EMR_AutoScaling_DefaultRole \
  --ebs-root-volume-size 10 \
  --service-role EMR_DefaultRole \
  --enable-debugging \
  --name 'spark test 600k partitions big cluster' \
  --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \
  --region us-east-1