Note: the types you declare in Spark must match the data types of the underlying Hive table.
import org.apache.spark.sql.{Encoder, Row}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types._

def getResponseEncoder: Encoder[Row] = {
  val schema = StructType(
    StructField("price", StringType, nullable = true) :: // bug: this column is bigint in Hive
    StructField("app_bundle_id", StringType, nullable = true) ::
    StructField("interactive_page_id", StringType, nullable = true) ::
    StructField("source_type", StringType, nullable = true) ::
    Nil
  )
  RowEncoder(schema)
}
Here, price is stored as bigint in the Hive parquet data, but the Spark code above declares it as StringType. The mismatch fails at runtime with:
Caused by: java.lang.RuntimeException: java.lang.Long is not a valid external type for schema of string
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:325)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
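The fix is to make the declared schema agree with the Hive column types. A minimal sketch, assuming price really is bigint in the table (Hive bigint maps to Spark's LongType) and the remaining columns are genuinely strings:

import org.apache.spark.sql.{Encoder, Row}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types._

def getResponseEncoder: Encoder[Row] = {
  val schema = StructType(
    StructField("price", LongType, nullable = true) :: // bigint in Hive -> LongType in Spark
    StructField("app_bundle_id", StringType, nullable = true) ::
    StructField("interactive_page_id", StringType, nullable = true) ::
    StructField("source_type", StringType, nullable = true) ::
    Nil
  )
  RowEncoder(schema)
}

Alternatively, if downstream code really needs the value as a string, cast it in the query itself (e.g. SELECT CAST(price AS STRING) ...) so the external type Spark sees is java.lang.String and StringType remains valid.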