-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional).
String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION matchkeygenerator as
'com.pb.bdq.amm.process.hive.matchkeygenerator.MatchKeyGeneratorUDF';
-- Match Key Generator is implemented as a UDF (User Defined function).
It processes one row at a time and generates a map of match keys for each row.
-- Set rule and header
set hivevar:rule='{"matchKeys":[{"expressMatchKey":false,"matchKeyField":"MatchKey1",
"rules":[{"algorithm":"Soundex","field":"busniessname","startPosition":1,"length":0,
"active":true,"sortInput":null,"removeNoiseCharacters":false}]},
{"expressMatchKey":false,"matchKeyField":"MatchKey2",
"rules":[{"algorithm":"Koeln","field":"busniessname","startPosition":1,"length":0,
"active":true,"sortInput":null,"removeNoiseCharacters":false}]}]}';
set hivevar:header='busniessname,recordid';
-- Execute query on the desired table to display the job output on console.
This query returns a map of key value for each row containing matchkeys as per rule passed.
SELECT busniessname, recordid, bar.ret["MatchKey1"] AS MatchKey1, bar.ret["MatchKey2"]
AS MatchKey2 FROM (SELECT *, matchkeygenerator (${hivevar:rule}, ${hivevar:header},
busniessname, recordid) AS ret FROM cust ) bar;
-- Query to dump output to a directory in file system
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/MatchKey/' row format
delimited FIELDS TERMINATED BY ',' MAP FIELDS TERMINATED BY ':' COLLECTION ITEMS TERMINATED
BY '|' LINES TERMINATED BY '\n' STORED AS TEXTFILE
SELECT busniessname, recordid, bar.ret["MatchKey1"] AS MatchKey1,
bar.ret["MatchKey2"] AS MatchKey2 FROM (
SELECT *, matchkeygenerator (${hiveconf:rule}, ${hiveconf:header},
busniessname, recordid) AS ret FROM cust ) bar;
--Sample data in input table customer
--+------------------------------------------+----------------+--+
--| cust.busniessname | cust.recordid |
--+------------------------------------------+----------------+--+
--| Internal Revenue Service | 0 |
--| Juan F Vera-Monroig | 1 |
--| Leonardo Pagan-Reyes | 2 |
--| Academia San Joaquin Colegios/Academias | 3 |
--| Nereida Portalatin-Padua | 4 |
--+------------------------------------------+----------------+--+
--Sample output for input query
+------------------------------------------+-----------+------------+-------------------+--+
| busniessname | recordid | matchkey1 | matchkey2 |
+------------------------------------------+-----------+------------+-------------------+--+
| Internal Revenue Service | 0 | I536 | 0627657368738 |
| Juan F Vera-Monroig | 1 | J511 | 063376674 |
| Leonardo Pagan-Reyes | 2 | L563 | 567214678 |
| Academia San Joaquin Colegios/Academias | 3 | A235 | 0426864645484268 |
| Nereida Portalatin-Padua | 4 | N631 | 67217252612 |
+------------------------------------------+-----------+------------+-------------------+--+