Sample Hive Script

Reference data placed on local node

-- Register Data Normalization Modue [dnm] BDQ Hive UDF Jar 
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). 
String in quotes represent class names needed for this job to run.
-- Table Lookup is implemented as a UDF (User Defined function).
 Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION tablelookup as 
'com.pb.bdq.dnm.process.hive.tablelookup.TableLookUpUDF';

-- Set rule 
set hivevar:rule='{"rules":[{"action":"Standardize", "source":"CityCode",
 "tableName":"State Name Abbreviations", "lookupMultipleWordTerms":false,
 "lookupIndividualTermsWithinField":false, "destination":"CityCode"}]}';

-- Set Reference Directory. This must be a local path on cluster machines 
and must be present on each node of the cluster at the same path.
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"LocaltoDataNodes",
"dataDir":"/home/data/referenceData"}';

set hive.fetch.task.conversion=none;

-- set header
set hivevar:header ='AccountDescription,Address,ApartmentNumber,CityCode';


-- Execute Query on the desired table, to display the job output on console.
 This query returns a map of key value pairs containing output fields for each row.

SELECT bar.ret["StandardizationTermIdentified"],
	bar.ret["accountdescription"],
	bar.ret["address"],
	bar.ret["apartmentnumber"],
	bar.ret["citycode"] 
FROM (
	SELECT tablelookup(${hivevar:rule}, ${hivevar:refereceDataDetails}, ${hivevar:header},
       accountdescription, address, apartmentnumber, citycode) 
	AS ret 
	FROM citizen_data
	) bar;

	
-- Query to dump output data to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/TableLookup/' row format
 delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE 
SELECT bar.ret["StandardizationTermIdentified"],
	bar.ret["accountdescription"],
	bar.ret["address"],
	bar.ret["apartmentnumber"],
	bar.ret["citycode"] 
FROM (
	SELECT tablelookup(${hiveconf:rule}, ${hiveconf:refereceDataDetails}, 
       ${hiveconf:header}, accountdescription, address, apartmentnumber, citycode) 
	AS ret 
	FROM citizen_data
	) bar;

--Sample input data
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--| citizen_data.accountdescription  | citizen_data.address  | citizen_data.apartmentnumber  | citizen_data.citycode |  
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--|  				    | 400 E M0 St Apt 1405  |                               | NY                     |    
--|                                  | 190 E 72nd St         |                               | NY                     |   
--|                                  | 1381 3rd Ave Apt 4    | 4                             | TTYYY                  | 
--+----------------------------------+-----------------------+-------------------------------+------------------------+   

--sample output data
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+   
--|StandardizationTermIdentified |       accountdescription | address               | apartmentnumber       | citycode|
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+ 
--| yes			     |  		          | 400 E M0 St Apt 1405  |                       | NEW YORK |                        
--| yes	     		|                          | 190 E 72nd St         |                       | NEW YORK |                       
--| yes 			    |                          | 1381 3rd Ave Apt 4    | 4                     | NEW YORK |
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+
			

Reference data placed on HDFS and downloaded on local nodes for jobs

-- Register Data Normalization Modue [dnm] BDQ Hive UDF Jar 
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
-- Table Lookup is implemented as a UDF (User Defined function). 
Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION tablelookup as 
'com.pb.bdq.dnm.process.hive.tablelookup.TableLookUpUDF';

-- Set rule 
set hivevar:rule='{"rules":[{"action":"Standardize", "source":"CityCode",
 "tableName":"State Name Abbreviations", "lookupMultipleWordTerms":false, 
"lookupIndividualTermsWithinField":false, "destination":"CityCode"}]}';

-- Set reference data details for Download manager, paas dataDir where 
data resides in HDFS and localFS path to download the data and dataDownloader as HDFS
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS",
"dataDir":"/home/data/dm/referenceData","dataDownloader":{"dataDownloader":"HDFS",
"localFSRepository":"/local/download"}}';

set hive.fetch.task.conversion=none;

-- set header
set hivevar:header ='AccountDescription,Address,ApartmentNumber,CityCode';


-- Execute Query on the desired table, to display the job output on console. 
This query returns a map of key value pairs containing output fields for each row.

SELECT bar.ret["StandardizationTermIdentified"],
	bar.ret["accountdescription"],
	bar.ret["address"],
	bar.ret["apartmentnumber"],
	bar.ret["citycode"] 
FROM (
	SELECT tablelookup(${hivevar:rule}, ${hivevar:refereceDataDetails},
 ${hivevar:header}, accountdescription, address, apartmentnumber, citycode) 
	AS ret 
	FROM citizen_data
	) bar;

	
-- Query to dump output data to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/TableLookup/' 
row format delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE 
SELECT bar.ret["StandardizationTermIdentified"],
	bar.ret["accountdescription"],
	bar.ret["address"],
	bar.ret["apartmentnumber"],
	bar.ret["citycode"] 
FROM (
	SELECT tablelookup(${hiveconf:rule}, ${hiveconf:refereceDataDetails},
 ${hiveconf:header}, accountdescription, address, apartmentnumber, citycode) 
	AS ret 
	FROM citizen_data
	) bar;

--Sample input data
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--| citizen_data.accountdescription  | citizen_data.address  | citizen_data.apartmentnumber  | citizen_data.citycode |  
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--|  				    | 400 E M0 St Apt 1405  |                               | NY                     |    
--|                                  | 190 E 72nd St         |                               | NY                     |   
--|                                  | 1381 3rd Ave Apt 4    | 4                             | TTYYY                  | 
--+----------------------------------+-----------------------+-------------------------------+------------------------+   

--sample output data
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+   
--|StandardizationTermIdentified |       accountdescription | address  	      | apartmentnumber       | citycode|
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+ 
--|yes			      |  		          | 400 E M0 St Apt 1405  |                       | NEW YORK |                        
--|yes	     		 |                          | 190 E 72nd St         |                       | NEW YORK |                       
--|yes 			     |                          | 1381 3rd Ave Apt 4    | 4                     | NEW YORK |
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+

Reference data placed on HDFS and downloaded to working directory for jobs

-- Register Data Normalization Modue [dnm] BDQ Hive UDF Jar 
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). 
String in quotes represent class names needed for this job to run.
-- Table Lookup is implemented as a UDF (User Defined function).
 Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION tablelookup as
 'com.pb.bdq.dnm.process.hive.tablelookup.TableLookUpUDF';

-- HDFS Reference data- via Distributed Cache DC- unarchive
ADD FILES hdfs://<HOST>:<PORT>/home/hduser/referenceData/;

--HDFS reference data- via Distributed cache -archive form of reference data 
--ADD ARCHIVE hdfs://<HOST>:<PORT>/home/hduser/referenceData.zip;
-- Set rule 
set hivevar:rule='{"rules":[{"action":"Standardize", 
"source":"CityCode", "tableName":"State Name Abbreviations",
 "lookupMultipleWordTerms":false, "lookupIndividualTermsWithinField":false, 
"destination":"CityCode"}]}';

-- Set Reference Data details .

--reference data details, can be added in zip 
or unarchive form, dataDir symbolises reference data 
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS","dataDir":"./referenceData",
"dataDownloader":{"dataDownloader":"DC"}}';

-- below format for archive form
--set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS",
"dataDir":"./referenceData.zip","dataDownloader":{"dataDownloader":"DC"}}';

set hive.fetch.task.conversion=none;

-- set header
set hivevar:header ='AccountDescription,Address,ApartmentNumber,CityCode';


-- Execute Query on the desired table, to display the job output on 
console. This query returns a map of key value pairs containing output fields for each row.

SELECT bar.ret["StandardizationTermIdentified"],
	bar.ret["accountdescription"],
	bar.ret["address"],
	bar.ret["apartmentnumber"],
	bar.ret["citycode"] 
FROM (
	SELECT tablelookup(${hivevar:rule}, ${hivevar:refereceDataDetails}, 
${hivevar:header}, accountdescription, address, apartmentnumber, citycode) 
	AS ret 
	FROM citizen_data
	) bar;

	
-- Query to dump output data to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/TableLookup/' row format
 delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE 
SELECT bar.ret["StandardizationTermIdentified"],
	bar.ret["accountdescription"],
	bar.ret["address"],
	bar.ret["apartmentnumber"],
	bar.ret["citycode"] 
FROM (
	SELECT tablelookup(${hiveconf:rule}, ${hiveconf:refereceDataDetails},
 ${hiveconf:header}, accountdescription, address, apartmentnumber, citycode) 
	AS ret 
	FROM citizen_data
	) bar;

--Sample input data
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--| citizen_data.accountdescription  | citizen_data.address  | citizen_data.apartmentnumber  | citizen_data.citycode |  
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--|  				    | 400 E M0 St Apt 1405  |                               | NY                     |    
--|                                  | 190 E 72nd St         |                               | NY                     |   
--|                                  | 1381 3rd Ave Apt 4    | 4                             | TTYYY                  | 
--+----------------------------------+-----------------------+-------------------------------+------------------------+   

--sample output data
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+   
--|StandardizationTermIdentified | accountdescription       | address  	      | apartmentnumber       | citycode|
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+ 
--|yes			      |  		          | 400 E M0 St Apt 1405  |                       | NEW YORK |                        
--|yes	     		 |                          | 190 E 72nd St         |                       | NEW YORK |                       
--|yes 			     |                          | 1381 3rd Ave Apt 4    | 4                     | NEW YORK |
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+