
Spark在Shuffle阶段临时文件及RDD持久化存储会放到spark.local.dir/SPARK_LOCAL_DIRS下,使用逗号分隔可以配置多个不同磁盘的路径。在Spark on Yarn下,这个路径会被Yarn集群的配置LOCAL_DIRS所取代。在Spark on k8s下,这个路径默认被emptyDir Volume所取代,可以简单地理解为java.io.tmpdir对应的/tmp目录。
Spark有4种方式可以把K8S Volume挂载到Executor/Driver Pod中。
hostPath: mounts a file or directory from the host node’s filesystem into a pod.
emptyDir: an initially empty volume created when a pod is assigned to a node.
nfs: mounts an existing NFS(Network File System) into a pod.
persistentVolumeClaim: mounts a PersistentVolume
into a pod.
# Set up JuiceFS dynamic provisioning: a Secret for the CSI driver, a
# StorageClass backed by csi.juicefs.com, and a 1Gi ReadWriteMany PVC.
# Heredoc delimiters are quoted ('EOF') so nothing inside is shell-expanded
# at file-creation time — the original used unquoted EOF and also had shell
# prompts ("$ ") pasted inside the heredoc bodies, which would have been
# written into the generated files verbatim.
mkdir dynamic-provisioning
cd dynamic-provisioning

cat > juicefs-sc-secret.sh << 'EOF'
kubectl -n kube-system create secret generic juicefs-sc-secret \
  --from-literal=name=mhdfs \
  --from-literal=metaurl=redis://:123456@172.16.2.120:16379/2 \
  --from-literal=storage=hdfs \
  --from-literal=bucket=172.16.2.119:8020 \
  --from-literal=access-key=hdfs
EOF

cat > storageClass.yaml << 'EOF'
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: juicefs-sc
provisioner: csi.juicefs.com
parameters:
  csi.storage.k8s.io/node-publish-secret-name: juicefs-sc-secret
  csi.storage.k8s.io/node-publish-secret-namespace: kube-system
  csi.storage.k8s.io/provisioner-secret-name: juicefs-sc-secret
  csi.storage.k8s.io/provisioner-secret-namespace: kube-system
reclaimPolicy: Delete
volumeBindingMode: Immediate
EOF

cat > persistentVolumeClaim.yaml << 'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: jtext-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 1Gi
  storageClassName: juicefs-sc
EOF

sh juicefs-sc-secret.sh
# secret/juicefs-sc-secret created
kubectl apply -f storageClass.yaml
# storageclass.storage.k8s.io/juicefs-sc created
kubectl apply -f persistentVolumeClaim.yaml
# persistentvolumeclaim/jtext-pvc created
# Deployment that keeps a long-lived driver container alive (sleep loop).
# The JuiceFS PVC is mounted three times with different subPaths:
# application jars, the Spark conf directory, and MLSQL scripts.
# Heredoc delimiter is quoted so the YAML is written literally.
cat > testp.yaml << 'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: spark-hello
  namespace: default
spec:
  selector:
    matchLabels:
      app: spark-hello
  strategy:
    rollingUpdate:
      maxUnavailable: 0
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: spark-hello
    spec:
      serviceAccountName: spark
      containers:
        - name: spark-hello
          args: [ "while true; do sleep 10000; done;" ]
          command:
            - /bin/sh
            - '-c'
          image: '172.16.2.66:5000/mlsql:3.0-j14-mlsql'
          imagePullPolicy: Always
          securityContext:
            runAsUser: 0
          volumeMounts:
            - name: juicefs-pv
              mountPath: /opt/mlsql/jar
              subPath: mlsql/mlsqljar
            - name: juicefs-pv
              mountPath: /opt/spark/conf
              subPath: mlsql/sparkconf
            - name: juicefs-pv
              mountPath: /opt/mlsql/script
              subPath: mlsql/script
      volumes:
        - name: juicefs-pv
          persistentVolumeClaim:
            # NOTE(review): the PVC created earlier in this walkthrough is
            # named "jtext-pvc" — confirm that a separate PVC "one-gb-fs"
            # actually exists, or change this to jtext-pvc.
            claimName: one-gb-fs
EOF
# Generate the driver-side launch script for MLSQL on Spark-on-K8s
# (client deploy mode, executor shuffle data on the JuiceFS-backed PVC).
#
# The heredoc delimiter MUST be quoted ('EOF'): the original used an
# unquoted EOF, so $(cat ...), $ip and awk's $1 were expanded at
# file-creation time ($1 being an empty shell positional parameter),
# producing a broken script.
cat > mlsql-start-d.sh << 'EOF'
# Resolve this pod's IP from /etc/hosts (the original used the relative
# path "etc/hosts", which only works when run from /).
# NOTE(review): picking line 8 of /etc/hosts is image-specific and
# fragile — confirm, or consider `hostname -i` instead.
ip=$(head -n 8 /etc/hosts | tail -n 1 | awk '{print $1}')
echo "$ip"
/opt/spark/bin/spark-submit --master k8s://https://172.16.2.62:6443 \
  --deploy-mode client \
  --class streaming.core.StreamingApp \
  --conf spark.kubernetes.container.image=172.16.2.66:5000/mlsql:3.0-j14-mlsql \
  --conf spark.kubernetes.container.image.pullPolicy=Always \
  --conf spark.kubernetes.namespace=default \
  --conf spark.kubernetes.executor.request.cores=0.05 \
  --conf spark.kubernetes.executor.limit.cores=0.3 \
  --conf spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.options.claimName=jtext-pvc \
  --conf spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.options.sizeLimit=1Gi \
  --conf spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.mount.path=/data \
  --conf spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.mount.subPath=cool \
  --conf spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.mount.readOnly=false \
  --conf spark.dynamicAllocation.enabled=true \
  --conf spark.dynamicAllocation.shuffleTracking.enabled=true \
  --conf spark.dynamicAllocation.minExecutors=3 \
  --conf spark.dynamicAllocation.maxExecutors=4 \
  --conf spark.dynamicAllocation.executorIdleTimeout=60 \
  --conf spark.jars.ivy=/tmp/.ivy \
  --conf spark.driver.host=$ip \
  --conf spark.sql.cbo.enabled=true \
  --conf spark.sql.adaptive.enabled=true \
  --conf spark.sql.cbo.joinReorder.enabled=true \
  --conf spark.sql.cbo.planStats.enabled=true \
  --conf spark.sql.cbo.starSchemaDetection=true \
  --conf spark.driver.maxResultSize=512m \
  --conf spark.executor.memory=512m \
  --conf spark.driver.memory=512m \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --conf spark.kryoserializer.buffer.max=100m \
  --conf spark.executor.extraJavaOptions="-XX:+UnlockExperimentalVMOptions -XX:+UseZGC -XX:+UseContainerSupport -Dio.netty.tryReflectionSetAccessible=true" \
  --conf spark.driver.extraJavaOptions="-XX:+UnlockExperimentalVMOptions -XX:+UseZGC -XX:+UseContainerSupport -Dio.netty.tryReflectionSetAccessible=true -DREALTIME_LOG_HOME=/tmp/__mlsql__/logs" \
  --jars /opt/mlsql/jar/juicefs-hadoop-0.11.0.jar \
  /opt/mlsql/jar/streamingpro-mlsql-spark_3.0_2.12-2.1.0-SNAPSHOT.jar \
  -streaming.name mlsql \
  -streaming.rest true \
  -streaming.thrift false \
  -streaming.platform spark \
  -streaming.enableHiveSupport true \
  -streaming.spark.service true \
  -streaming.job.cancel true \
  -streaming.driver.port 9003
EOF
# Fixes vs. the original paste:
#   - "readOnly=false\--conf ..." lacked a space before the backslash, so
#     the continuation glued "false" to the next "--conf" into one word.
#   - jar paths were relative ("opt/mlsql/jar/..."); they are now absolute
#     to match the container mount points from the Deployment.
# Note this parameter:
#   spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.options.claimName=jtext-pvc
# Spark docs:
#   spark.kubernetes.driver.volumes.[VolumeType].[VolumeName].options.[OptionName]=<value>
# The VolumeName must start with "spark-local-dir-"; multiple such volumes
# may be mounted.
# Deploy the driver pod, enter its container, start MLSQL, and smoke-test
# the REST endpoint.
# Fixed typo from the original: "tesp.yaml" — the file written above is
# testp.yaml. Paths made absolute ("/bin/sh", "/opt/mlsql") so the
# commands do not depend on the current working directory.
kubectl create -f testp.yaml
kubectl get pods -o wide
docker ps | grep hello
docker exec -it container_id /bin/sh   # replace container_id with the real ID
cd /opt/mlsql
nohup sh mlsql-start-d.sh &
curl -XPOST hello.mlsql.com/run/script -d 'sql=select 1 a as t;'
hadoop fs -ls jfs://test/
drwxr-xr-x - hdfs hdfs 4096 2021-03-06 23:31 jfs://test/mlsql
drwxr-xr-x - hdfs hdfs 4096 2021-03-06 23:12 jfs://test/pvc-a5ea07cc-8f4f-475f-98ef-fc67d77c0dc7
[root@t1-27-70 jfsh]# hadoop fs -ls jfs://test/pvc-a5ea07cc-8f4f-475f-98ef-fc67d77c0dc7/cool
drwxr-xr-x - hdfs hdfs 4096 2021-03-07 18:11 jfs://test/pvc-a5ea07cc-8f4f-475f-98ef-fc67d77c0dc7/cool/blockmgr-1d640b9c-a9b9-4158-922a-faf09c7bc94c
drwxr-xr-x - hdfs hdfs 4096 2021-03-07 18:11 jfs://test/pvc-a5ea07cc-8f4f-475f-98ef-fc67d77c0dc7/cool/blockmgr-9c4ba9b4-f8cc-4b86-9e19-3bf2636bc4eb
drwxr-xr-x - hdfs hdfs 4096 2021-03-07 18:07 jfs://test/pvc-a5ea07cc-8f4f-475f-98ef-fc67d77c0dc7/cool/blockmgr-c0cb6f2f-0303-4ee5-b0b2-64be3fb5c2ec
drwx------ - hdfs hdfs 4096 2021-03-07 18:07 jfs://test/pvc-a5ea07cc-8f4f-475f-98ef-fc67d77c0dc7/cool/spark-44b33eef-0c64-4626-ba59-c5c18fffca35
drwx------ - hdfs hdfs 4096 2021-03-07 18:11 jfs://test/pvc-a5ea07cc-8f4f-475f-98ef-fc67d77c0dc7/cool/spark-4c13d70a-e5ca-4137-9496-b732a2a36e56
drwx------ - hdfs hdfs 4096 2021-03-07 18:11 jfs://test/pvc-a5ea07cc-8f4f-475f-98ef-fc67d77c0dc7/cool/spark-7ea25290-b70e-4305-9d30-843a8887a7d5

图片素材1:卢浮宫
图片素材2:互联网

喜欢就点击最上方的[ MLSQL之道 ]关注下吧!右下角还有在看哦!
源码地址:
https://github.com/latincross/mlsqlwechat
文章转载自MLSQL之道,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




