【发布时间】:2014-01-13 22:43:36
【问题描述】:
我正在尝试使用 Hive JSON SerDe 将 Twitter 的 JSON 数据导入 Hive 表。我先把 JSON 导入到一个用 ROW FORMAT SERDE 定义的表中,然后再把数据从该表导入到另一个以 RCFile 格式存储的表中。这个流程可以运行一段时间,但随后会抛出如下 ClassCastException:
java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row [Error getting row data with exception java.lang.ClassCastException: java.lang.Integer cannot be cast to java.lang.Double
at org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaDoubleObjectInspector.get(JavaDoubleObjectInspector.java:40)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:259)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:307)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:354)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:354)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:354)
at org.apache.hadoop.hive.serde2.SerDeUtils.getJSONString(SerDeUtils.java:220)
at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:667)
at org.apache.hadoop.hive.ql.exec.ExecMapper.map(ExecMapper.java:141)
at org.apache.hadoop
这是我用来定义 SerDe 表的架构:
-- gh_raw: external Hive table over raw Twitter status JSON, one JSON object
-- per row, deserialized by the OpenX JSON SerDe from /user/ahanna/gh_raw.
-- The nested struct layout mirrors the tweet object: top-level tweet fields,
-- the embedded retweeted_status tweet, and the embedded user object.
--
-- Numeric Twitter ids (user.id, user_mentions.id) are declared bigint:
-- Twitter ids are 64-bit values and do not fit in a 32-bit int. The *_str
-- columns carry the same ids as strings.
--
-- NOTE(review): the "Integer cannot be cast to Double" ClassCastException
-- reported against this table presumably comes from an array <double> field
-- (coordinates / bounding_box) whose JSON value has no decimal point; that
-- looks like a SerDe-side conversion issue and is not addressed by this DDL
-- -- confirm against the SerDe version in use.
--
-- Comment lines are kept at column 0 and outside the statement on purpose,
-- since some Hive CLI versions only skip lines that start with "--".
CREATE EXTERNAL TABLE gh_raw (
    coordinates struct <
        coordinates: array <double>,
        type: string>,
    created_at string,
    entities struct <
        hashtags: array <struct <
            text: string>>,
        media: array <struct <
            display_url: string,
            expanded_url: string,
            media_url: string,
            media_url_https: string,
            sizes: struct <
                large: struct <
                    h: int,
                    resize: string,
                    w: int>,
                medium: struct <
                    h: int,
                    resize: string,
                    w: int>,
                small: struct <
                    h: int,
                    resize: string,
                    w: int>,
                thumb: struct <
                    h: int,
                    resize: string,
                    w: int>>,
            type: string,
            url: string>>,
        urls: array <struct <
            display_url: string,
            expanded_url: string,
            url: string>>,
        user_mentions: array <struct <
            id: bigint,
            name: string,
            screen_name: string>>>,
    geo struct <
        coordinates: array <double>,
        type: string>,
    id_str string,
    in_reply_to_screen_name string,
    in_reply_to_status_id_str string,
    in_reply_to_user_id_str string,
    place struct <
        attributes: struct <
            locality: string,
            region: string,
            street_address: string>,
        bounding_box: struct <
            coordinates: array <array <array <double>>>,
            type: string>,
        country: string,
        country_code: string,
        full_name: string,
        name: string,
        place_type: string,
        url: string>,
    possibly_sensitive boolean,
    retweeted_status struct <
        coordinates: struct <
            coordinates: array <double>,
            type: string>,
        created_at: string,
        entities: struct <
            hashtags: array <struct <
                text: string>>,
            media: array <struct <
                display_url: string,
                expanded_url: string,
                media_url: string,
                media_url_https: string,
                sizes: struct <
                    large: struct <
                        h: int,
                        resize: string,
                        w: int>,
                    medium: struct <
                        h: int,
                        resize: string,
                        w: int>,
                    small: struct <
                        h: int,
                        resize: string,
                        w: int>,
                    thumb: struct <
                        h: int,
                        resize: string,
                        w: int>>,
                type: string,
                url: string>>,
            urls: array <struct <
                display_url: string,
                expanded_url: string,
                url: string>>,
            user_mentions: array <struct <
                id: bigint,
                name: string,
                screen_name: string>>>,
        favorited: boolean,
        geo: struct <
            coordinates: array <double>,
            type: string>,
        id_str: string,
        in_reply_to_screen_name: string,
        in_reply_to_status_id_str: string,
        in_reply_to_user_id_str: string,
        place: struct <
            attributes: struct <
                locality: string,
                region: string,
                street_address: string>,
            bounding_box: struct <
                coordinates: array <array <array <double>>>,
                type: string>,
            country: string,
            country_code: string,
            full_name: string,
            name: string,
            place_type: string,
            url: string>,
        possibly_sensitive: boolean,
        scopes: struct <
            followers: boolean>,
        source: string,
        text: string,
        truncated: boolean,
        user: struct <
            contributors_enabled: boolean,
            created_at: string,
            default_profile: boolean,
            default_profile_image: boolean,
            description: string,
            favourites_count: int,
            followers_count: int,
            friends_count: int,
            geo_enabled: boolean,
            id: bigint,
            id_str: string,
            is_translator: boolean,
            lang: string,
            listed_count: int,
            `location`: string,
            name: string,
            profile_background_color: string,
            profile_background_image_url: string,
            profile_background_image_url_https: string,
            profile_background_tile: boolean,
            profile_banner_url: string,
            profile_image_url: string,
            profile_image_url_https: string,
            profile_link_color: string,
            profile_sidebar_border_color: string,
            profile_sidebar_fill_color: string,
            profile_text_color: string,
            profile_use_background_image: boolean,
            protected: boolean,
            screen_name: string,
            statuses_count: int,
            time_zone: string,
            url: string,
            utc_offset: int,
            verified: boolean>>,
    source string,
    text string,
    truncated boolean,
    user struct <
        contributors_enabled: boolean,
        created_at: string,
        default_profile: boolean,
        default_profile_image: boolean,
        description: string,
        favourites_count: int,
        followers_count: int,
        friends_count: int,
        geo_enabled: boolean,
        id: bigint,
        id_str: string,
        is_translator: boolean,
        lang: string,
        listed_count: int,
        `location`: string,
        name: string,
        profile_background_color: string,
        profile_background_image_url: string,
        profile_background_image_url_https: string,
        profile_background_tile: boolean,
        profile_banner_url: string,
        profile_image_url: string,
        profile_image_url_https: string,
        profile_link_color: string,
        profile_sidebar_border_color: string,
        profile_sidebar_fill_color: string,
        profile_text_color: string,
        profile_use_background_image: boolean,
        protected: boolean,
        screen_name: string,
        statuses_count: int,
        time_zone: string,
        url: string,
        utc_offset: int,
        verified: boolean>
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION '/user/ahanna/gh_raw';
我猜测它是在解析某一组坐标(coordinates)或边界框(bounding_box)时崩溃的。
我认为这是我所使用的 JSON SerDe 的一个 bug,但我不能确定。我使用的是下面这个版本,并且已经从源码重新编译过;有人说这个版本已经修复了该问题,但我这里仍然报同样的错误:https://github.com/brndnmtthws/Hive-JSON-Serde
【问题讨论】:
标签: java json hadoop hive cloudera