サンプル Hive スクリプト

-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). 
String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';

-- This rowid is needed by Intraflow Match to maintain
 the order of rows while creating groups. This is a UDF (User Defined Function)
and associates an incremental unique integer number to each row of the data.

CREATE TEMPORARY FUNCTION intraMatch as 'com.pb.bdq.amm.process.hive.intraflow.IntraMatchUDAF';
-- Intra Flow is implemented as a UDAF (User Defined Aggregation function).
 It processes one group of rows at a time and generates the result for that group of rows

-- Disable map side aggregation 
set hive.map.aggr = false;

-- Set the rule using configuration property 'hivevar:rule'
set hivevar:rule='{"type":"Parent",
"children":[{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0,
"algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone", "weight":0, "options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks",
 "crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0, 
"algorithms":[{"name":"KeyboardDistance", "weight":0, "options":null},
{"name":"Metaphone3", "weight":0, "options":null}], "scoringMethod":"Maximum", 
"missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"lastname", "candidateField":null}],
"matchingMethod":"AllTrue", "scoringMethod":"Average", "missingDataMethod":"IgnoreBlanks", 
"name":"NameData", "matchWhenNotTrue":false, "threshold":100,"weight":0}';


-- Set header(along with id field alias used in query) using configuration property 'hivevar:header'
set hivevar:header='firstname,lastname,matchkey,middlename,id';

-- Set the express match column (optional)
set hivevar:expresscolumn='matchkey';

-- Set sort field name to the alias used in the query,
 using the configuration property 'hivevar:sortfield'
set hivevar:sortfield='id';

-- Set sort collection number option for unique records using 
configuration property 'hivevar:UniqueCollectionNumber'
set hivevar:UniqueCollectionNumber='false';

-- Execute Query on the desired table. The query uses a UDF rowid, 
which must be present in the query to maintain the ordering of the data while reading.
-- Intra Match returns a list of map containing <key=value> pairs. 
Each map in the list corresponds to a row in the group. 
The below query explodes that list of map and fetches fields from map by keys.

SELECT innerresult.record["MatchRecordType"],
	innerresult.record["MatchScore"],
	innerresult.record["CollectionNumber"],
	innerresult.record["ExpressMatched"],
	innerresult.record["firstname"],
	innerresult.record["lastname"],
	innerresult.record["matchkey"],
	innerresult.record["middlename"]
	FROM (
		SELECT  intraMatch( ${hivevar:rule},
				${hivevar:sortfield},
				${hivevar:expresscolumn},
				${hivevar:UniqueCollectionNumber},
				${hivevar:header},
				innerRowID.firstname, 
				innerRowID.lastname, 
				innerRowID.matchkey, 
				innerRowID.middlename, 
				innerRowID.id
		) AS matchgroup  
		FROM (
			SELECT  firstname, lastname, matchkey, middlename, rowid(*) 
			AS id 
			FROM customer_data
			) innerRowID 
		GROUP BY matchkey 
	) AS innerResult 
	LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record ;

-- Query to dump output to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/IntraFlow/'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' collection items terminated by '||' map keys terminated by ':'
SELECT innerresult.record["MatchRecordType"],
	innerresult.record["MatchScore"],
	innerresult.record["CollectionNumber"],
	innerresult.record["ExpressMatched"],
	innerresult.record["firstname"],
	innerresult.record["lastname"],
	innerresult.record["matchkey"],
	innerresult.record["middlename"]
	FROM (
		SELECT  intraMatch(innerRowID.firstname, 
				innerRowID.lastname, 
				innerRowID.matchkey, 
				innerRowID.middlename, 
				innerRowID.id
		) AS matchgroup 
		FROM (
				SELECT  firstname, lastname, matchkey, middlename, rowid(*) 
				AS id 
				FROM customer_data
			) innerRowID 
		GROUP BY matchkey 
		) AS innerResult 
	LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record ;


--sample input data
--+-------------+-------------+---------------+--------------+
--| firstname   | lastname    | middlename    | matchkey     |
--+-------------+-------------+---------------+--------------+
--| Steven      | Aaen        | LYRIC         | AAE          |
--| DEBRA       | AALMO       | BOATMAN       | AAE          |              
--| MARY        | AARON       | ROLLING MEADOW| AAE          |
--+-------------+-------------+---------------+--------------+

--sample output data
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+
--| firstname | lastname|middlename  | matchkey|MatchRecordType|CollectionNumber|ExpressMatched|MatchScore| 
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+
--| Steven    | Aaen    | LYRIC      | AAE     |    S	       |         0-0-1  |     Y        |     0    |
--| DEBRA     | AALMO   | BOATMAN    | AAE     |    D 	      |         0-0-1  |     Y        |    100   | 
--| MARY      | AARON   | ROLLING MEA| AAE     |    D  	     |         0-0-1  |     Y        |    100   |
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+