-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional).
String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';
-- This rowid is needed by Intraflow Match to maintain
the order of rows while creating groups. This is a UDF (User Defined Function)
and associates an incremental unique integer number to each row of the data.
CREATE TEMPORARY FUNCTION intraMatch as 'com.pb.bdq.amm.process.hive.intraflow.IntraMatchUDAF';
-- Intra Flow is implemented as a UDAF (User Defined Aggregation function).
It processes one group of rows at a time and generates the result for that group of rows
-- Disable map side aggregation
set hive.map.aggr = false;
-- Set the rule using configuration property 'hivevar:rule'
set hivevar:rule='{"type":"Parent",
"children":[{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0,
"algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone", "weight":0, "options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks",
"crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0,
"algorithms":[{"name":"KeyboardDistance", "weight":0, "options":null},
{"name":"Metaphone3", "weight":0, "options":null}], "scoringMethod":"Maximum",
"missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"lastname", "candidateField":null}],
"matchingMethod":"AllTrue", "scoringMethod":"Average", "missingDataMethod":"IgnoreBlanks",
"name":"NameData", "matchWhenNotTrue":false, "threshold":100,"weight":0}';
-- Set header(along with id field alias used in query) using configuration property 'hivevar:header'
set hivevar:header='firstname,lastname,matchkey,middlename,id';
-- Set the express match column (optional)
set hivevar:expresscolumn='matchkey';
-- Set sort field name to the alias used in the query,
using the configuration property 'hivevar:sortfield'
set hivevar:sortfield='id';
-- Set sort collection number option for unique records using
configuration property 'hivevar:UniqueCollectionNumber'
set hivevar:UniqueCollectionNumber='false';
-- Execute Query on the desired table. The query uses a UDF rowid,
which must be present in the query to maintain the ordering of the data while reading.
-- Intra Match returns a list of map containing <key=value> pairs.
Each map in the list corresponds to a row in the group.
The below query explodes that list of map and fetches fields from map by keys.
SELECT innerresult.record["MatchRecordType"],
innerresult.record["MatchScore"],
innerresult.record["CollectionNumber"],
innerresult.record["ExpressMatched"],
innerresult.record["firstname"],
innerresult.record["lastname"],
innerresult.record["matchkey"],
innerresult.record["middlename"]
FROM (
SELECT intraMatch( ${hivevar:rule},
${hivevar:sortfield},
${hivevar:expresscolumn},
${hivevar:UniqueCollectionNumber},
${hivevar:header},
innerRowID.firstname,
innerRowID.lastname,
innerRowID.matchkey,
innerRowID.middlename,
innerRowID.id
) AS matchgroup
FROM (
SELECT firstname, lastname, matchkey, middlename, rowid(*)
AS id
FROM customer_data
) innerRowID
GROUP BY matchkey
) AS innerResult
LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record ;
-- Query to dump output to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/IntraFlow/'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' collection items terminated by '||' map keys terminated by ':'
SELECT innerresult.record["MatchRecordType"],
innerresult.record["MatchScore"],
innerresult.record["CollectionNumber"],
innerresult.record["ExpressMatched"],
innerresult.record["firstname"],
innerresult.record["lastname"],
innerresult.record["matchkey"],
innerresult.record["middlename"]
FROM (
SELECT intraMatch(innerRowID.firstname,
innerRowID.lastname,
innerRowID.matchkey,
innerRowID.middlename,
innerRowID.id
) AS matchgroup
FROM (
SELECT firstname, lastname, matchkey, middlename, rowid(*)
AS id
FROM customer_data
) innerRowID
GROUP BY matchkey
) AS innerResult
LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record ;
--sample input data
--+-------------+-------------+---------------+--------------+
--| firstname | lastname | middlename | matchkey |
--+-------------+-------------+---------------+--------------+
--| Steven | Aaen | LYRIC | AAE |
--| DEBRA | AALMO | BOATMAN | AAE |
--| MARY | AARON | ROLLING MEADOW| AAE |
--+-------------+-------------+---------------+--------------+
--sample output data
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+
--| firstname | lastname|middlename | matchkey|MatchRecordType|CollectionNumber|ExpressMatched|MatchScore|
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+
--| Steven | Aaen | LYRIC | AAE | S | 0-0-1 | Y | 0 |
--| DEBRA | AALMO | BOATMAN | AAE | D | 0-0-1 | Y | 100 |
--| MARY | AARON | ROLLING MEA| AAE | D | 0-0-1 | Y | 100 |
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+