Issue
Trying to add a column with a literal value to a PySpark DataFrame fails with "col should be Column".
- Code
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import SQLContext
from pyspark.sql.functions import year, month, date_format

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init('addYearColumn')

datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "ssbgz", table_name = "gz_customer", transformation_ctx = "datasource0")
df = datasource0.toDF()
df.cache()

yearAddedDf = df.withColumn("yyyy", "1001")
- Error message
col should be Column
Traceback (most recent call last):
  File "/mnt/yarn/usercache/livy/appcache/application_1568634099044_0001/container_1568634099044_0001_01_000001/pyspark.zip/pyspark/sql/dataframe.py", line 1619, in withColumn
    assert isinstance(col, Column), "col should be Column"
AssertionError: col should be Column
Solution
The second argument to withColumn must be a Column object, not a plain Python value, so wrap the literal with lit() from pyspark.sql.functions.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import SQLContext
from pyspark.sql.functions import year, month, date_format
from pyspark.sql.functions import lit  # ★ import lit

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init('addYearColumn')

datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "ssbgz", table_name = "gz_customer", transformation_ctx = "datasource0")
df = datasource0.toDF()
df.cache()

yearAddedDf = df.withColumn("yyyy", lit('1001'))  # ★ wrap the literal in lit()
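- Reference: standalone reproduction

The same fix can be checked outside Glue with a plain local SparkSession. This is a minimal sketch for illustration; the session setup and the sample rows are assumptions, not part of the original job.

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Local session just for the demo (the original job gets its session from GlueContext).
spark = SparkSession.builder.master("local[1]").appName("litDemo").getOrCreate()

# Hypothetical sample rows standing in for the gz_customer table.
df = spark.createDataFrame([("c1",), ("c2",)], ["c_custkey"])

# df.withColumn("yyyy", "1001") would raise AssertionError: col should be Column,
# because withColumn requires a Column as its second argument.
yearAddedDf = df.withColumn("yyyy", lit("1001"))  # lit() turns the Python value into a Column

yearAddedDf.show()
# Expected output (approximately):
# +---------+----+
# |c_custkey|yyyy|
# +---------+----+
# |       c1|1001|
# |       c2|1001|
# +---------+----+

Note that lit('1001') creates a string column; use lit(1001) instead if the year should be an integer.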