ローカル ノードに配置されたリファレンス データ
-- Register Universal Name Module [UNM] BDQ Hive UDF Jar
ADD JAR <Directory path>/unm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional).
String in quotes represent class names needed for this job to run.
-- Open Name Parser is implemented as a UDF (User Defined function).
Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION opennameparser as 'com.pb.bdq.unm.process.hive.opennameparser.OpenNameParserUDF';
-- set rule
set hivevar:rule='{"name":"name", "culture":"", "splitConjoinedNames":false,
"shortcutThreshold":0, "parseNaturalOrderPersonalNames":false,
"naturalOrderPersonalNamesPriority":1,"parseReverseOrderPersonalNames":false,
"reverseOrderPersonalNamesPriority":2, "parseConjoinedNames":false,
"naturalOrderConjoinedPersonalNamesPriority":3, "reverseOrderConjoinedPersonalNamesPriority":4,
"parseBusinessNames":false, "businessNamesPriority":5}';
-- Set Reference Directory. This must be a local path on cluster machines and must
be present on each node of the cluster at the same path.
set hivevar:refereceDataDetails='{"referenceDataPathLocation":
"LocaltoDataNodes","dataDir":"/home/data/referenceData"}';
-- set header
set hivevar:header='inputrecordid,Name,nametype';
set hive.fetch.task.conversion=none;
-- Execute Query on the desired table, to display the job output on console.
This query returns a map of key value pairs containing output fields for each row.
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"]
from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails}, ${hivevar:header},
inputrecordid, name, nametype) as tmp1 from nameparser) as
tmp LATERAL VIEW explode(tmp1) adTable AS adid;
-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/opennameparser/' row format
delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"]
from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails}, ${hivevar:header},
inputrecordid, name, nametype) as tmp1 from nameparser)
as tmp LATERAL VIEW explode(tmp1) adTable AS adid;
--sample input data
+----------------------------------+--------------------------+-----------------------+
| inputrecordid | name | nametype |
+----------------------------------+--------------------------+-----------------------+
| 1 |JOHN VAN DER LINDEN-JONES | Simple Name |
| 2 |RYAN JOHN SMITH | Simple Name |
+----------------------------------+--------------------------+-----------------------+
--sample output data
+----------------------------+--------------+---------------------+
| Name | NameScore | CultureCode |
+----------------------------+--------------+---------------------+
| JOHN VAN DER LINDEN-JONES | 75 | True |
| RYAN JOHN SMITH | 100 | True |
+----------------------------+-----+------------------------------+
HDFS に配置され、ジョブ用のローカル ノードにダウンロードされたリファレンス データ
-- Register Universal Name Module [UNM] BDQ Hive UDF Jar
ADD JAR <Directory path>/unm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional). String in
quotes represent class names needed for this job to run.
-- Open Name Parser is implemented as a UDF (User Defined function).
Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION opennameparser as 'com.pb.bdq.unm.process.hive.opennameparser.OpenNameParserUDF';
-- set rule
set hivevar:rule='{"name":"name", "culture":"", "splitConjoinedNames":false,
"shortcutThreshold":0, "parseNaturalOrderPersonalNames":false,
"naturalOrderPersonalNamesPriority":1, "parseReverseOrderPersonalNames":false,
"reverseOrderPersonalNamesPriority":2, "parseConjoinedNames":false,
"naturalOrderConjoinedPersonalNamesPriority":3, "reverseOrderConjoinedPersonalNamesPriority":4,
"parseBusinessNames":false, "businessNamesPriority":5}';
-- Set Reference Directory. This must be a local path on cluster machines and
must be present on each node of the cluster at the same path.
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS","dataDir"
:"/home/data/dm/referenceData","dataDownloader":{"dataDownloader":"HDFS","localFSRepository":"/local/download"}}';
-- set header
set hivevar:header='inputrecordid,Name,nametype';
set hive.fetch.task.conversion=none;
-- Execute Query on the desired table, to display the job output
on console. This query returns a map of key value pairs containing output fields for each row.
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"]
from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails},
${hivevar:header}, inputrecordid, name, nametype) as tmp1 from nameparser)
as tmp LATERAL VIEW explode(tmp1) adTable AS adid;
-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/opennameparser/' row format
delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE
select adTable.adid["Name"], adTable.adid["NameScore"], adTable.adid["CultureCode"]
from (select opennameparser(${hivevar:rule}, ${hivevar:refereceDataDetails},
${hivevar:header},inputrecordid, name, nametype) as tmp1 from nameparser)
as tmp LATERAL VIEW explode(tmp1) adTable AS adid;
--sample input data
+----------------------------------+--------------------------+-----------------------+
| inputrecordid | name | nametype |
+----------------------------------+--------------------------+-----------------------+
| 1 |JOHN VAN DER LINDEN-JONES | Simple Name |
| 2 |RYAN JOHN SMITH | Simple Name |
+----------------------------------+--------------------------+-----------------------+
--sample output data
+----------------------------+--------------+---------------------+
| Name | NameScore | CultureCode |
+----------------------------+--------------+---------------------+
| JOHN VAN DER LINDEN-JONES | 75 | True |
| RYAN JOHN SMITH | 100 | True |
+----------------------------+-----+------------------------------+