サンプル Hive スクリプト

-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional).
String in quotes represent class names needed for this job to run.

CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';

-- This rowid is needed by Transactional Match to maintain the 
order of rows while creating groups. This is a UDF (User Defined Function) and 
associates an incremental unique integer number to each row of the data.

CREATE TEMPORARY FUNCTION transactionalMatch as
 'com.pb.bdq.amm.process.hive.transactional.TransactionalMatchUDAF';

-- Transactional Match is implemented as a UDAF (User Defined Aggregation function). 
It processes one group of rows at a time and generates the result for that group of rows.

-- Disable map side aggregation 
set hive.map.aggr = false;

-- Set the rule using configuration property 'hivevar:rule'
set hivevar:rule='{"type":"Parent", "children":[{"type":"Child", "matchWhenNotTrue":false,
 "threshold":80.0, "weight":0, "algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone","weight":0,"options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[],
 "suspectField":"firstname", "candidateField":null},
{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0,
 "weight":0, "algorithms":[{"name":"KeyboardDistance","weight":0,"options":null},
{"name":"Metaphone3","weight":0,"options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks",
"crossMatchField":[], "suspectField":"lastname", "candidateField":null}],
 "matchingMethod":"AllTrue", "scoringMethod":"Average", "missingDataMethod":"IgnoreBlanks",
 "name":"NameData", "matchWhenNotTrue":false, "threshold":100, "weight":0}';

-- Set header(along with id field alias used in query) using configuration property 'hivevar:header'
set hivevar:header='name,firstname,lastname,matchkey,middlename,recordid,id';

-- Set sort field name to the alias used in the query,
 using the configuration property 'hivevar:sort'
set hivevar:sort='id';

-- Set sort collection number option for unique records using configuration property
 'hivevar:uniquecolumnsreturn'. The default value is false.
set hivevar:uniquecolumnsreturn='true';

-- Execute Query on the desired table. The query uses a UDF rowid,
 which must be present in the query to maintain the ordering of the data while reading.
-- Transactional Match returns a list of map containing <key=value> pairs.
 Each map in the list corresponds to a row in the group. The below query 
explodes that list of map and fetches fields from map by keys.

SELECT tmp2.record["MatchRecordType"],
	tmp2.record["MatchScore"],
	tmp2.record["HasDuplicate"],
	tmp2.record["name"],
	tmp2.record["firstname"],
	tmp2.record["lastname"],
	tmp2.record["matchkey"],
	tmp2.record["middlename"],
	tmp2.record["recordid"]
FROM (
	SELECT  transactionalMatch(innerRowID.name, innerRowID.firstname,
 innerRowID.lastname, innerRowID.matchkey, innerRowID.middlename, 
 innerRowID.recordid, innerRowID.id
	) AS matchgroup  
	FROM (
		SELECT name, firstname, lastname, matchkey, middlename, 
             recordid, rowid(name, firstname, lastname, matchkey, middlename, 
             recordid) AS id FROM customer_data
		) innerRowID 
	GROUP BY matchkey 
) As innerResult 
LATERAL VIEW explode(innerResult.matchgroup) tmp2 as record ;


-- Query to dump output to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/transmatch/' ROW FORMAT DELIMITED
 FIELDS TERMINATED BY ',' collection items terminated by '||' map keys terminated by ':'
SELECT tmp2.record["MatchRecordType"],
	tmp2.record["MatchScore"],
	tmp2.record["HasDuplicate"],
	tmp2.record["name"],
	tmp2.record["firstname"],
	tmp2.record["lastname"],
	tmp2.record["matchkey"],
	tmp2.record["middlename"],
	tmp2.record["recordid"]
FROM (
	SELECT  transactionalMatch(${hivevar:rule},
			${hivevar:sort}, 
			${hivevar:uniquecolumnsreturn},
			${hivevar:header},
			innerRowID.name, 
			innerRowID.firstname, 
			innerRowID.lastname,
			innerRowID.matchkey,
			innerRowID.middlename,
			innerRowID.recordid,
			innerRowID.id) as matchgroup  
	FROM (
		SELECT name, firstname, lastname, matchkey, middlename, 
recordid, rowid(name, firstname, lastname, matchkey, middlename, recordid) AS id 
		FROM customer_data
		) innerRowID 
	GROUP BY matchkey ) As innerResult 
LATERAL VIEW explode(innerResult.matchgroup) tmp2 as record ;



--sample input data
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--| name              | firstname              | lastname              | matchkey              | middlename              | recordid              |
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--| ZORINA  ABDOOL    | ZORINA                 | ABDOOL                | Z                     |                         | 12                    |
--| ZULFIQAR  ALI     | ZULFIQAR               | ALI                   | Z                     |                         | 116                   |
--| ZACHARY  BENNETT  | ZACHARY                | BENNETT               | Z                     |                         | 515                   |
--| ZOHAR  BUERGER    | ZOHAR                  | BUERGER               | Z                     |                         | 889                   |
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+

--sample output data
--+----------------+---------------------+----------+------------+-----------+-----------------+------------+--------------+
--|name            |firstname | lastname | matchkey | middlename | recordid  | MatchRecordType | MatchScore | HasDuplicate |
--+----------------+----------+----------+----------+------------+-----------+-----------------+------------+--------------+
--|ZORINA  ABDOOL  |ZORINA    | ABDOOL   | Z        |            | 12        |      S	   |	0    |	Y      |
--|ZULFIQAR  ALI   |ZULFIQAR  | ALI      | Z        |            | 116       |      D          |	90   |	D      |
--|ZACHARY  BENNETT|ZACHARY   | BENNETT  | Z        |            | 515       |      D          |	91   |	D      |
--|ZOHAR  BUERGER  |ZOHAR     | BUERGER  | Z        |            | 889       |      D	   |	91   |	D      |
--+----------------+----------+---------------------+------------------------+-----------------+------------+--------------+