ローカル ノードに配置されたリファレンス データ
-- Register Data Normalization Modue [dnm] BDQ Hive UDF Jar
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional).
String in quotes represent class names needed for this job to run.
-- Table Lookup is implemented as a UDF (User Defined function).
Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION tablelookup as
'com.pb.bdq.dnm.process.hive.tablelookup.TableLookUpUDF';
-- Set rule
set hivevar:rule='{"rules":[{"action":"Standardize", "source":"CityCode",
"tableName":"State Name Abbreviations", "lookupMultipleWordTerms":false,
"lookupIndividualTermsWithinField":false, "destination":"CityCode"}]}';
-- Set Reference Directory. This must be a local path on cluster machines
and must be present on each node of the cluster at the same path.
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"LocaltoDataNodes",
"dataDir":"/home/data/referenceData"}';
set hive.fetch.task.conversion=none;
-- set header
set hivevar:header ='AccountDescription,Address,ApartmentNumber,CityCode';
-- Execute Query on the desired table, to display the job output on console.
This query returns a map of key value pairs containing output fields for each row.
SELECT bar.ret["StandardizationTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["apartmentnumber"],
bar.ret["citycode"]
FROM (
SELECT tablelookup(${hivevar:rule}, ${hivevar:refereceDataDetails}, ${hivevar:header},
accountdescription, address, apartmentnumber, citycode)
AS ret
FROM citizen_data
) bar;
-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/TableLookup/' row format
delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE
SELECT bar.ret["StandardizationTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["apartmentnumber"],
bar.ret["citycode"]
FROM (
SELECT tablelookup(${hiveconf:rule}, ${hiveconf:refereceDataDetails},
${hiveconf:header}, accountdescription, address, apartmentnumber, citycode)
AS ret
FROM citizen_data
) bar;
--Sample input data
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--| citizen_data.accountdescription | citizen_data.address | citizen_data.apartmentnumber | citizen_data.citycode |
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--| | 400 E M0 St Apt 1405 | | NY |
--| | 190 E 72nd St | | NY |
--| | 1381 3rd Ave Apt 4 | 4 | TTYYY |
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--sample output data
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+
--|StandardizationTermIdentified | accountdescription | address | apartmentnumber | citycode|
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+
--| yes | | 400 E M0 St Apt 1405 | | NEW YORK |
--| yes | | 190 E 72nd St | | NEW YORK |
--| yes | | 1381 3rd Ave Apt 4 | 4 | NEW YORK |
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+
HDFS に配置され、ジョブ用のローカル ノードにダウンロードされたリファレンス データ
-- Register Data Normalization Modue [dnm] BDQ Hive UDF Jar
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
-- Table Lookup is implemented as a UDF (User Defined function).
Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION tablelookup as
'com.pb.bdq.dnm.process.hive.tablelookup.TableLookUpUDF';
-- Set rule
set hivevar:rule='{"rules":[{"action":"Standardize", "source":"CityCode",
"tableName":"State Name Abbreviations", "lookupMultipleWordTerms":false,
"lookupIndividualTermsWithinField":false, "destination":"CityCode"}]}';
-- Set reference data details for Download manager, paas dataDir where
data resides in HDFS and localFS path to download the data and dataDownloader as HDFS
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS",
"dataDir":"/home/data/dm/referenceData","dataDownloader":{"dataDownloader":"HDFS",
"localFSRepository":"/local/download"}}';
set hive.fetch.task.conversion=none;
-- set header
set hivevar:header ='AccountDescription,Address,ApartmentNumber,CityCode';
-- Execute Query on the desired table, to display the job output on console.
This query returns a map of key value pairs containing output fields for each row.
SELECT bar.ret["StandardizationTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["apartmentnumber"],
bar.ret["citycode"]
FROM (
SELECT tablelookup(${hivevar:rule}, ${hivevar:refereceDataDetails},
${hivevar:header}, accountdescription, address, apartmentnumber, citycode)
AS ret
FROM citizen_data
) bar;
-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/TableLookup/'
row format delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE
SELECT bar.ret["StandardizationTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["apartmentnumber"],
bar.ret["citycode"]
FROM (
SELECT tablelookup(${hiveconf:rule}, ${hiveconf:refereceDataDetails},
${hiveconf:header}, accountdescription, address, apartmentnumber, citycode)
AS ret
FROM citizen_data
) bar;
--Sample input data
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--| citizen_data.accountdescription | citizen_data.address | citizen_data.apartmentnumber | citizen_data.citycode |
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--| | 400 E M0 St Apt 1405 | | NY |
--| | 190 E 72nd St | | NY |
--| | 1381 3rd Ave Apt 4 | 4 | TTYYY |
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--sample output data
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+
--|StandardizationTermIdentified | accountdescription | address | apartmentnumber | citycode|
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+
--|yes | | 400 E M0 St Apt 1405 | | NEW YORK |
--|yes | | 190 E 72nd St | | NEW YORK |
--|yes | | 1381 3rd Ave Apt 4 | 4 | NEW YORK |
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+