Sample Hive Script

Reference data placed on local node

-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar 
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;

-- Register the grammar for open parser
ADD FILE /home/hadoop/testing/openparser/hive/testdata/Open_parser_grammer_tc1.txt;

-- Provide alias to UDF class (optional). String in quotes
represent class names needed for this job to run.
-- Open Parser is implemented as a UDF(User Defined function).
 Hence it processes one row at a time and generates a map of key value pairs for each row.

CREATE TEMPORARY FUNCTION openparser as 'com.pb.bdq.dnm.process.hive.openparser.OpenParserUDF';

set hive.fetch.task.conversion=minimal;

set hivevar:header='InputKeyValue,addressline';

-- Set Reference Directory. This must be a local path on 
cluster machines and must be present on each node of the cluster at the same path.
set hivevar:refereceDataDetails='{"referenceDataPathLocation":
"LocaltoDataNodes","dataDir":"/home/data/referenceData"}';

-- Set rule with grammar file name
set hivevar:rule='{"grammar":null,"domain":null,"defaultCulture":null,
"culture":null,"returnMultipleParsedRecords":false,"debug":false,
"grammerFilePath":"Open_parser_grammer_tc1.txt"}';

--Execute Query on desired table to sump the output to local dir.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/testing/openparser/hive/testdata/output' 
row format delimited FIELDS TERMINATED BY '|' 
lines terminated by '\n' 
STORED AS TEXTFILE
SELECT coalesce(tmp2.record ["InputKeyValue"], '')
	,coalesce(tmp2.record ["addressline"], '')
	,coalesce(tmp2.record ["ApartmentNumber"], '')
	,coalesce(tmp2.record ["HouseNumber"], '')
	,coalesce(tmp2.record ["IsParsed"], '')
	,coalesce(tmp2.record ["LeadingDirectional"], '')
	,coalesce(tmp2.record ["ParserScore"], '')
	,coalesce(tmp2.record ["POBox"], '')
	,coalesce(tmp2.record ["PrivateMailbox"], '')
	,coalesce(tmp2.record ["RRHC"], '')
	,coalesce(tmp2.record ["RuleID"], '')
	,coalesce(tmp2.record ["StreetName"], '')
	,coalesce(tmp2.record ["StreetSuffix"], '')
	,coalesce(tmp2.record ["TrailingDirectional"], '')
FROM (
	SELECT openparser(${hivevar:rule},${hivevar:refereceDataDetails}, 
       ${hivevar:header}, InputKeyValue, addressline) AS mygp
	FROM openparserinputtable1
	) AS addressgroup LATERAL VIEW explode(addressgroup.mygp) tmp2 AS record;

Reference data placed on HDFS and downloaded on local nodes for jobs

-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar 
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;

-- Register the grammar for open parser
ADD FILE /home/hadoop/testing/openparser/hive/testdata/Open_parser_grammer_tc1.txt;

-- Provide alias to UDF class (optional). 
String in quotes represent class names needed for this job to run.
-- Open Parser is implemented as a UDF(User Defined function). 
Hence it processes one row at a time and generates a map of key value pairs for each row.

CREATE TEMPORARY FUNCTION openparser as 'com.pb.bdq.dnm.process.hive.openparser.OpenParserUDF';

set hive.fetch.task.conversion=minimal;

set hivevar:header='InputKeyValue,addressline';

-- Set reference data details for Download manager, paas 
dataDir where data resides in HDFS and localFS path to download the data and dataDownloader as HDFS
set hivevar:refereceDataDetails='{"referenceDataPathLocation":
"HDFS","dataDir":"/home/data/dm/referenceData","dataDownloader":
{"dataDownloader":"HDFS","localFSRepository":"/local/download"}}';

set hive.fetch.task.conversion=none;

-- Set rule with grammar file name
set hivevar:rule='{"grammar":null,"domain":null,"defaultCulture":null,
"culture":null,"returnMultipleParsedRecords":false,"debug":false,
"grammerFilePath":"Open_parser_grammer_tc1.txt"}';

--Execute Query on desired table to sump the output to local dir.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/testing/openparser/hive/testdata/output' 
row format delimited FIELDS TERMINATED BY '|' 
lines terminated by '\n' 
STORED AS TEXTFILE
SELECT coalesce(tmp2.record ["InputKeyValue"], '')
	,coalesce(tmp2.record ["addressline"], '')
	,coalesce(tmp2.record ["ApartmentNumber"], '')
	,coalesce(tmp2.record ["HouseNumber"], '')
	,coalesce(tmp2.record ["IsParsed"], '')
	,coalesce(tmp2.record ["LeadingDirectional"], '')
	,coalesce(tmp2.record ["ParserScore"], '')
	,coalesce(tmp2.record ["POBox"], '')
	,coalesce(tmp2.record ["PrivateMailbox"], '')
	,coalesce(tmp2.record ["RRHC"], '')
	,coalesce(tmp2.record ["RuleID"], '')
	,coalesce(tmp2.record ["StreetName"], '')
	,coalesce(tmp2.record ["StreetSuffix"], '')
	,coalesce(tmp2.record ["TrailingDirectional"], '')
FROM (
	SELECT openparser(${hivevar:rule},${hivevar:refereceDataDetails},
       ${hivevar:header}, InputKeyValue, addressline) AS mygp
	FROM openparserinputtable1
	) AS addressgroup LATERAL VIEW explode(addressgroup.mygp) tmp2 AS record;


Reference data placed on HDFS and downloaded to working directory for jobs

-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar 
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;

-- Register the grammar for open parser
ADD FILE /home/hadoop/testing/openparser/hive/testdata/Open_parser_grammer_tc1.txt;


-- HDFS Reference data- via Distributed Cache DC- unarchive
ADD FILES hdfs://<HOST>:<PORT>/home/hduser/referenceData/;

--HDFS reference data- via Distributed cache -archive form of reference data 
--ADD ARCHIVE hdfs://<HOST>:<PORT>/home/hduser/referenceData.zip;

-- Provide alias to UDF class (optional). 
String in quotes represent class names needed for this job to run.
-- Open Parser is implemented as a UDF(User Defined function).
 Hence it processes one row at a time and generates a map of key value pairs for each row.

CREATE TEMPORARY FUNCTION openparser as
 'com.pb.bdq.dnm.process.hive.openparser.OpenParserUDF';

set hive.fetch.task.conversion=minimal;

set hivevar:header='InputKeyValue,addressline';


-- Set Reference Data details .

--reference data details, can be added in zip or
 unarchive form, dataDir symbolises reference data 
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS",
"dataDir":"./referenceData","dataDownloader":{"dataDownloader":"DC"}}';

-- below format for archive form
--set hivevar:refereceDataDetails='{"referenceDataPathLocation":
"HDFS","dataDir":"./referenceData.zip","dataDownloader":{"dataDownloader":"DC"}}';

-- Set rule with grammar file name
set hivevar:rule='{"grammar":null,"domain":null,"defaultCulture":null,
"culture":null,"returnMultipleParsedRecords":false,"debug":
false,"grammerFilePath":"Open_parser_grammer_tc1.txt"}';

--Execute Query on desired table to sump the output to local dir.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/testing/openparser/hive/testdata/output' 
row format delimited FIELDS TERMINATED BY '|' 
lines terminated by '\n' 
STORED AS TEXTFILE
SELECT coalesce(tmp2.record ["InputKeyValue"], '')
	,coalesce(tmp2.record ["addressline"], '')
	,coalesce(tmp2.record ["ApartmentNumber"], '')
	,coalesce(tmp2.record ["HouseNumber"], '')
	,coalesce(tmp2.record ["IsParsed"], '')
	,coalesce(tmp2.record ["LeadingDirectional"], '')
	,coalesce(tmp2.record ["ParserScore"], '')
	,coalesce(tmp2.record ["POBox"], '')
	,coalesce(tmp2.record ["PrivateMailbox"], '')
	,coalesce(tmp2.record ["RRHC"], '')
	,coalesce(tmp2.record ["RuleID"], '')
	,coalesce(tmp2.record ["StreetName"], '')
	,coalesce(tmp2.record ["StreetSuffix"], '')
	,coalesce(tmp2.record ["TrailingDirectional"], '')
FROM (
	SELECT openparser(${hivevar:rule},${hivevar:refereceDataDetails}, 
       ${hivevar:header}, InputKeyValue, addressline) AS mygp
	FROM openparserinputtable1
	) AS addressgroup LATERAL VIEW explode(addressgroup.mygp) tmp2 AS record;