Sample Hive Script

Reference data placed on local node

-- Register Universal Name Module [UNM] BDQ Hive UDF Jar 
ADD JAR <Directory path>/unm.hive.${project.version}.jar;

-- Provide an alias for the UDF class (optional). The string in quotes is the
-- class name needed for this job to run.
-- Open Name Parser is implemented as a UDF (User Defined Function), so it
-- processes one row at a time and generates a map of key-value pairs for each row.
CREATE TEMPORARY FUNCTION opennameparser as 'com.pb.bdq.unm.process.hive.opennameparser.OpenNameParserUDF';
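
-- Optional check: DESCRIBE FUNCTION is a standard Hive command that confirms the
-- alias above is registered in the current session.
DESCRIBE FUNCTION opennameparser;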

-- set rule
set hivevar:rule='{"name":"name", "culture":"", "splitConjoinedNames":false,
"shortcutThreshold":0, "parseNaturalOrderPersonalNames":false, 
"naturalOrderPersonalNamesPriority":1,"parseReverseOrderPersonalNames":false,
"reverseOrderPersonalNamesPriority":2, "parseConjoinedNames":false, 
"naturalOrderConjoinedPersonalNamesPriority":3, "reverseOrderConjoinedPersonalNamesPriority":4,
"parseBusinessNames":false, "businessNamesPriority":5}';

-- Set the reference data directory. This must be a local path on the cluster
-- machines and must be present at the same path on every node of the cluster.
set hivevar:refereceDataDetails='{"referenceDataPathLocation":
"LocaltoDataNodes","dataDir":"/home/data/referenceData"}';

-- set header
set hivevar:header='inputrecordid,Name,nametype';

set hive.fetch.task.conversion=none;
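
-- The query below reads from a Hive table named nameparser with the input columns
-- listed in the header above. As a minimal sketch (the column types and input file
-- path are illustrative placeholders, not part of the module), such a table could be
-- created and loaded like this:
CREATE TABLE IF NOT EXISTS nameparser (inputrecordid string, name string, nametype string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '<Input file path>/nameparser.csv' INTO TABLE nameparser;
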
-- Execute the query on the desired table to display the job output on the console.
-- This query returns a map of key-value pairs containing the output fields for each row.
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"] 
from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails}, ${hivevar:header},
 inputrecordid, name, nametype) as tmp1 from nameparser) as 
tmp LATERAL VIEW explode(tmp1) adTable AS adid;


-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/opennameparser/' row format 
delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE 
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"]
 from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails}, ${hivevar:header},
 inputrecordid, name, nametype) as tmp1 from nameparser)
 as tmp LATERAL VIEW explode(tmp1) adTable AS adid;


--sample input data
+----------------+----------------------------+--------------+
| inputrecordid  | name                       | nametype     |
+----------------+----------------------------+--------------+
| 1              | JOHN VAN DER LINDEN-JONES  | Simple Name  |
| 2              | RYAN JOHN SMITH            | Simple Name  |
+----------------+----------------------------+--------------+

--sample output data
+-----------------------------+------------+--------------+
| Name                        | NameScore  | CultureCode  |
+-----------------------------+------------+--------------+
| JOHN VAN DER LINDEN-JONES   | 75         | True         |
| RYAN JOHN SMITH             | 100        | True         |
+-----------------------------+------------+--------------+

Reference data placed on HDFS and downloaded to local nodes for jobs

-- Register Universal Name Module [UNM] BDQ Hive UDF Jar 
ADD JAR <Directory path>/unm.hive.${project.version}.jar;

-- Provide an alias for the UDF class (optional). The string in quotes is the
-- class name needed for this job to run.
-- Open Name Parser is implemented as a UDF (User Defined Function), so it
-- processes one row at a time and generates a map of key-value pairs for each row.
CREATE TEMPORARY FUNCTION opennameparser as 'com.pb.bdq.unm.process.hive.opennameparser.OpenNameParserUDF';

-- set rule
set hivevar:rule='{"name":"name", "culture":"", "splitConjoinedNames":false,
 "shortcutThreshold":0, "parseNaturalOrderPersonalNames":false, 
"naturalOrderPersonalNamesPriority":1, "parseReverseOrderPersonalNames":false,
 "reverseOrderPersonalNamesPriority":2, "parseConjoinedNames":false, 
"naturalOrderConjoinedPersonalNamesPriority":3, "reverseOrderConjoinedPersonalNamesPriority":4,
 "parseBusinessNames":false, "businessNamesPriority":5}';

-- Set the reference data details. Here the reference data resides on HDFS and is
-- downloaded to a local file system repository on each node before the job runs.
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS","dataDir"
:"/home/data/dm/referenceData","dataDownloader":{"dataDownloader":"HDFS","localFSRepository":"/local/download"}}';

-- set header
set hivevar:header='inputrecordid,Name,nametype';

set hive.fetch.task.conversion=none;
-- Execute the query on the desired table to display the job output on the console.
-- This query returns a map of key-value pairs containing the output fields for each row.
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"] 
from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails},
 ${hivevar:header}, inputrecordid, name, nametype) as tmp1 from nameparser) 
as tmp LATERAL VIEW explode(tmp1) adTable AS adid;


-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/opennameparser/' row format 
delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE 
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"]
 from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails},
 ${hivevar:header},inputrecordid, name, nametype) as tmp1 from nameparser) 
as tmp LATERAL VIEW explode(tmp1) adTable AS adid;

--sample input data
+----------------+----------------------------+--------------+
| inputrecordid  | name                       | nametype     |
+----------------+----------------------------+--------------+
| 1              | JOHN VAN DER LINDEN-JONES  | Simple Name  |
| 2              | RYAN JOHN SMITH            | Simple Name  |
+----------------+----------------------------+--------------+

--sample output data
+-----------------------------+------------+--------------+
| Name                        | NameScore  | CultureCode  |
+-----------------------------+------------+--------------+
| JOHN VAN DER LINDEN-JONES   | 75         | True         |
| RYAN JOHN SMITH             | 100        | True         |
+-----------------------------+------------+--------------+

Reference data placed on HDFS and downloaded to the working directory for jobs

-- Register Universal Name Module [UNM] BDQ Hive UDF Jar 
ADD JAR <Directory path>/unm.hive.${project.version}.jar;

-- Provide an alias for the UDF class (optional). The string in quotes is the
-- class name needed for this job to run.
-- Open Name Parser is implemented as a UDF (User Defined Function), so it
-- processes one row at a time and generates a map of key-value pairs for each row.
CREATE TEMPORARY FUNCTION opennameparser as 'com.pb.bdq.unm.process.hive.opennameparser.OpenNameParserUDF';

-- HDFS reference data via Distributed Cache (DC), unarchived form
ADD FILES hdfs://<HOST>:<PORT>/home/hduser/referenceData/;

-- HDFS reference data via Distributed Cache (DC), archived form of the reference data:
--ADD ARCHIVE hdfs://<HOST>:<PORT>/home/hduser/referenceData.zip;
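
-- Optionally confirm the resources added to the distributed cache for this session
-- (LIST FILES and LIST ARCHIVES are standard Hive commands):
LIST FILES;
--LIST ARCHIVES;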

-- set rule
set hivevar:rule='{"name":"name", "culture":"", "splitConjoinedNames":false,
 "shortcutThreshold":0, "parseNaturalOrderPersonalNames":false,
 "naturalOrderPersonalNamesPriority":1,"parseReverseOrderPersonalNames":false,
 "reverseOrderPersonalNamesPriority":2, "parseConjoinedNames":false,
 "naturalOrderConjoinedPersonalNamesPriority":3, "reverseOrderConjoinedPersonalNamesPriority":4,
 "parseBusinessNames":false, "businessNamesPriority":5}';

-- Set the reference data details. The reference data can be added in archived (zip)
-- or unarchived form; dataDir points to the reference data in the working directory.
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS","dataDir":"./referenceData",
"dataDownloader":{"dataDownloader":"DC"}}';

-- Use the format below for the archived (zip) form of the reference data:
--set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS","dataDir":"./referenceData.zip","dataDownloader":{"dataDownloader":"DC"}}';

-- set header
set hivevar:header='inputrecordid,Name,nametype';

set hive.fetch.task.conversion=none;
-- Execute the query on the desired table to display the job output on the console.
-- This query returns a map of key-value pairs containing the output fields for each row.
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"] 
from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails},
 ${hivevar:header}, inputrecordid, name, nametype) as tmp1 from nameparser)
 as tmp LATERAL VIEW explode(tmp1) adTable AS adid;


-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/opennameparser/'
row format delimited FIELDS TERMINATED BY ','
 lines terminated by '\n' STORED AS TEXTFILE 
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"]
from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails},
${hivevar:header}, inputrecordid, name, nametype) as tmp1 from nameparser) 
as tmp LATERAL VIEW explode(tmp1) adTable AS adid;



--sample input data
+----------------+----------------------------+--------------+
| inputrecordid  | name                       | nametype     |
+----------------+----------------------------+--------------+
| 1              | JOHN VAN DER LINDEN-JONES  | Simple Name  |
| 2              | RYAN JOHN SMITH            | Simple Name  |
+----------------+----------------------------+--------------+

--sample output data
+-----------------------------+------------+--------------+
| Name                        | NameScore  | CultureCode  |
+-----------------------------+------------+--------------+
| JOHN VAN DER LINDEN-JONES   | 75         | True         |
| RYAN JOHN SMITH             | 100        | True         |
+-----------------------------+------------+--------------+