$ cd /opt
$ sudo mkdir apache-spark apache-maven
$ sudo yum -y install java-1.8.0-openjdk-devel.x86_64
$ java -version
$ sudo curl -O https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-common/apache-maven-3.6.0-bin.tar.gz
$ sudo tar zxvf apache-maven-3.6.0-bin.tar.gz -C /opt/apache-maven
$ sudo curl -O https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-1.0/spark-2.4.3-bin-hadoop2.8.tgz
$ sudo tar zxvf spark-2.4.3-bin-hadoop2.8.tgz -C /opt/apache-spark
$ vi ~/.bash_profile
export SPARK_HOME=/opt/apache-spark/spark-2.4.3-bin-spark-2.4.3-bin-hadoop2.8
export PATH=/opt/apache-maven/apache-maven-3.6.0/bin:$SPARK_HOME/bin:$PATH
$ source ~/.bash_profile
$ cd ~
$ git clone https://github.com/awslabs/aws-glue-libs.git
$ cd aws-glue-libs
$ wget https://raw.githubusercontent.com/ktsmy/study-aws-glue/master/pom.xml
$ spark-shell
21/02/26 07:06:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at http://ip-172-31-21-108.ap-northeast-1.compute.internal:4040
Spark context available as 'sc' (master = local[*], app id = local-1614323167359).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/
Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_272)
Type in expressions to have them evaluated.
Type :help for more information.
scala> :q
$ sudo yum -y install git \
  bzip2 \
  bzip2-devel \
  gcc \
  libffi-devel \
  make \
  openssl \
  openssl-devel \
  readline \
  readline-devel \
  sqlite \
  sqlite-devel \
  zlib-devel \
  patch
$ curl -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash
$ cat << 'EOS' >> ~/.bashrc
export PATH="~/.pyenv/bin:$PATH"
eval "$(pyenv init -)"
eval "$(pyenv virtualenv-init -)"
EOS
$ source ~/.bashrc
$ pyenv install 2.7.1
$ pyenv global 2.7.1
$ pyenv rehash
$ pyenv versions
$ sudo yum -y install python-pip
$ pip install pyspark==2.4.3
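Before wiring up the endpoints, it is worth confirming that the module imports cleanly and matches the Spark version installed above (a minimal check; gluepyspark itself runs against the Spark installation under SPARK_HOME, not this pip package):

$ python -c "import pyspark; print(pyspark.__version__)"
2.4.3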
$ sudo vi /opt/apache-spark/spark-2.4.3-bin-spark-2.4.3-bin-hadoop2.8/conf/spark-defaults.conf
# Point the DynamoDB and S3 clients at LocalStack instead of real AWS endpoints
spark.hadoop.dynamodb.endpoint http://localhost:4569
spark.hadoop.fs.s3a.endpoint http://localhost:4572
spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.signing-algorithm S3SignerType
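These settings are read by every Spark session launched from this installation, so both pyspark and gluepyspark will talk to LocalStack. Once a REPL is up, you can confirm they were picked up (a minimal check, assuming the file above was saved as shown):

$ pyspark
>>> sc.getConf().get("spark.hadoop.fs.s3a.endpoint")
u'http://localhost:4572'
>>> exit()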
Trying it out
$ cd ~/aws-glue-libs
$ ./bin/gluepyspark
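If the dependency build succeeds, gluepyspark drops you into a PySpark REPL with the Glue libraries on the path. The snippet below is a minimal smoke test using the standard aws-glue-libs entry points; the frame name "dyf" and the sample rows are arbitrary placeholders, not part of the original steps:

>>> from awsglue.context import GlueContext
>>> from awsglue.dynamicframe import DynamicFrame
>>> glueContext = GlueContext(sc)   # sc is provided by the REPL
>>> df = spark.createDataFrame([(1, "foo"), (2, "bar")], ["id", "name"])
>>> dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
>>> dyf.count()
2

Thanks to the endpoint settings above, reads and writes against s3a:// paths from this session go to LocalStack rather than real S3.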