-- Session setup for the Advanced Transformer sample: registers the UDF jar, declares the
-- function alias and defines the three hiveconf variables that the queries below pass to it.
--
-- Register the Data Normalisation Module [DNM] BDQ Hive UDF jar.
-- Replace <Directory path> with the actual location of the jar before running.
-- NOTE(review): ${project.version} looks like a build-time placeholder - confirm it is
-- resolved to a concrete version in the distributed script.
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
-- Provide an alias for the UDF class (optional). The string in quotes is the class name needed for this job to run.
-- Advanced Transformer is implemented as a UDF (User Defined Function). Hence it processes one row at a time and generates a map of key-value pairs for each row.
CREATE TEMPORARY FUNCTION advanceTransform as 'com.pb.bdq.dnm.process.hive.advancetransformer.AdvanceTransformerUDF';
-- Set the rule: a JSON document with a single "TableData" extraction rule that reads the
-- "address" field, names "address_1" as nonExtractedData and "address_2" as extractedData,
-- and uses the "Street Suffix Abbreviations" table.
-- The queries reference this variable unquoted as ${hiveconf:rule}, so the surrounding
-- single quotes are deliberately part of the value.
set rule='{"rules":[{"extractionType":"TableData", "source":"address", "nonExtractedData":"address_1", "extractedData":"address_2", "tokenizationCharacters":"", "tableName":"Street Suffix Abbreviations", "multipleTermLookup":false, "tokenize":true, "extract":"ExtractTerm", "includeTermWith":"ExtractedData", "wordsToExtract":2}]}';
-- Set the reference directory. This must be a local path on the cluster machines and must be present on each node of the cluster at the same path.
set refdir='/home/hadoop/reference/';
-- Set the header: a comma-separated list of field names, given in the same order as the
-- columns passed to advanceTransform in the queries (accountdescription, address).
set header ='AccountDescription,Address';
-- Run the transformer against the input table and display the result on the console.
-- The inner query calls advanceTransform once per row; it returns a map of key-value pairs
-- (exposed as ret) holding the output fields for that row. The outer query projects
-- individual fields out of that map.
-- Each projected field carries an explicit alias so the result columns are named after the
-- field they hold instead of receiving auto-generated column names.
SELECT
    xformed.ret["AdvancedTransformTermIdentified"] AS AdvancedTransformTermIdentified,
    xformed.ret["accountdescription"] AS accountdescription,
    xformed.ret["address"] AS address,
    xformed.ret["address_1"] AS address_1
FROM (
    SELECT advanceTransform(${hiveconf:rule}, ${hiveconf:refdir}, ${hiveconf:header}, accountdescription, address) AS ret
    FROM advxformX
) xformed;
-- Write the transformer output to a local directory as comma-delimited text files.
-- Any existing contents of the target directory are overwritten.
-- The inner query produces one map (ret) per input row; the outer query picks the
-- fields to be written, in the order listed.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/AdvXformer/'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE
SELECT
    xformed.ret["AdvancedTransformTermIdentified"],
    xformed.ret["accountdescription"],
    xformed.ret["address"],
    xformed.ret["address_1"]
FROM (
    SELECT advanceTransform(${hiveconf:rule}, ${hiveconf:refdir}, ${hiveconf:header}, accountdescription, address) AS ret
    FROM advxformX
) xformed;
-- Sample input data
-- +----------------------------------+---------------------+-----------------------+
-- | AdvancedTransformTermIdentified | accountdescription | address |
-- +----------------------------------+---------------------+-----------------------+
-- | Yes | | 400 E M0 St Apt 1405 |
-- | Yes | | 190 E 72nd St |
-- +----------------------------------+---------------------+-----------------------+
-- Sample output data
-- +----------------------------------+---------------------+-----------------------+--------------------+
-- | AdvancedTransformTermIdentified | accountdescription | address | address_1 |
-- +----------------------------------+---------------------+-----------------------+--------------------+
-- | Yes | | 400 E M0 St Apt 1405 | 400 E M0 Apt 1405 |
-- | Yes | | 190 E 72nd St | 190 E 72nd |
-- +----------------------------------+---------------------+-----------------------+--------------------+