GP-1084 - fixed exception in Version Tracking correlator

Closes #1152
dragonmacher 2021-06-30 11:10:23 -04:00
parent 6962885c3e
commit 651d59c2df
2 changed files with 293 additions and 305 deletions

VTAbstractReferenceProgramCorrelator.java

@@ -20,6 +20,8 @@ import static ghidra.feature.vt.api.correlator.program.VTAbstractReferenceProgra
import java.util.*;
import java.util.Map.Entry;
import org.apache.commons.collections4.map.LazyMap;
import generic.DominantPair;
import generic.lsh.vector.LSHCosineVectorAccum;
import generic.lsh.vector.VectorCompare;
@@ -31,6 +33,7 @@ import ghidra.program.model.address.Address;
import ghidra.program.model.address.AddressSetView;
import ghidra.program.model.listing.*;
import ghidra.program.model.symbol.*;
import ghidra.util.datastruct.Counter;
import ghidra.util.exception.CancelledException;
import ghidra.util.task.TaskMonitor;
@@ -44,9 +47,13 @@ public abstract class VTAbstractReferenceProgramCorrelator extends VTAbstractPro
private static final double DIFFERENTIAL = 0.2;
private static final double EQUALS_EPSILON = 0.00001;
private static final Comparator<VTMatchInfo> SCORE_COMPARATOR = (o1, o2) -> {
return o2.getSimilarityScore().compareTo(o1.getSimilarityScore());
};
private String correlatorName;
private HashMap<Address, LSHCosineVectorAccum> srcFuncAddresstoVectorMap;
private HashMap<Address, LSHCosineVectorAccum> destFuncAddresstoVectorMap;
private Map<Address, LSHCosineVectorAccum> srcVectorsByAddress;
private Map<Address, LSHCosineVectorAccum> destVectorsByAddress;
private Program sourceProgram;
private Program destinationProgram;
@@ -55,13 +62,13 @@ public abstract class VTAbstractReferenceProgramCorrelator extends VTAbstractPro
/**
* Correlator class constructor.
* @param serviceProvider The {@code ServiceProvider}.
* @param sourceProgram The source {@code Program}.
* @param sourceAddressSet The {@code AddressSetView} for the source program.
* @param destinationProgram The destination {@code Program}.
* @param destinationAddressSet The {@code AddressSetView} for the destination program.
* @param correlatorName The correlator name string passed from the factory.
* @param options {@code ToolOptions}
* @param serviceProvider the service provider
* @param sourceProgram the source program
* @param sourceAddressSet the source addresses to correlate
* @param destinationProgram the destination program
* @param destinationAddressSet the destination addresses to correlate
* @param correlatorName the correlator name
* @param options the tool options
*/
VTAbstractReferenceProgramCorrelator(ServiceProvider serviceProvider, Program sourceProgram,
AddressSetView sourceAddressSet, Program destinationProgram,
@@ -84,81 +91,72 @@ public abstract class VTAbstractReferenceProgramCorrelator extends VTAbstractPro
}
/**
* First generates the sourceDictionary from the source program and matchSet,
* then finds the destinations corresponding to the matchSet and the
* First generates the sourceDictionary from the source program and matchSet,
* then finds the destinations corresponding to the matchSet and the
* sourceDictionary using the preset similarity and confidence thresholds.
*
* @param matchSet VTMatchSetDB containing all existing matches sorted into
* subsets corresponding to the generating correlators.
*
* @param matchSet contains all existing matches
* @param monitor the task monitor
* @throws CancelledException the process cancellation exception
* @throws CancelledException if cancelled
*/
@Override
protected void doCorrelate(VTMatchSet matchSet, TaskMonitor monitor) throws CancelledException {
double minbits = getOptions().getDouble(CONFIDENCE_THRESHOLD, CONFIDENCE_THRESHOLD_DEFAULT);
double similarity_threshold =
getOptions().getDouble(SIMILARITY_THRESHOLD, SIMILARITY_THRESHOLD_DEFAULT);
monitor.setMessage("Finding reference features");
extractReferenceFeatures(matchSet, monitor);
monitor.setMessage("Finding destination functions");
try {
findDestinations(matchSet, similarity_threshold, minbits, monitor);
}
catch (Exception e) {
throw new RuntimeException("problem with parallel decompiler", e);
}
findDestinations(matchSet, monitor);
}
/**
* findDestinations updates matchSet with non-null VTMatchInfo members returned from transform.
* For each of the entries in the destinationMap = {destMatchAddr:[list of source references]},
* we test all pairs [list of source references] x [list of destination references]
* For each of the entries in the destinationMap = {destMatchAddr:[list of source references]},
* we test all pairs [list of source references] x [list of destination references]
*
* </br>
* </br>
* Note: {@code destinationMap} is a class variable set by {@code extractReferenceFeatures}
*
* @param matchSet The {@code VTMatchSet} for the current session (non-transitive).
* @param similarityThreshold The {@code double} threshold passed to {@code transform}
* @param minbits The {@code double} minbits value passed to {@code transform}.
* @param monitor The {@code TaskMonitor} (non-transitive).
* @param matchSet The {@code VTMatchSet} for the current session (non-transitive)
* @param monitor task monitor
* @throws CancelledException if cancelled
*/
protected void findDestinations(final VTMatchSet matchSet, final double similarityThreshold,
double minbits, final TaskMonitor monitor) {
private void findDestinations(VTMatchSet matchSet, TaskMonitor monitor)
throws CancelledException {
monitor.initialize(destFuncAddresstoVectorMap.size());
for (Entry<Address, LSHCosineVectorAccum> destEntry : destFuncAddresstoVectorMap.entrySet()) {
if (monitor.isCancelled()) {
return;
}
monitor.initialize(destVectorsByAddress.size());
Set<Entry<Address, LSHCosineVectorAccum>> destEntries =
destVectorsByAddress.entrySet();
for (Entry<Address, LSHCosineVectorAccum> destEntry : destEntries) {
monitor.checkCanceled();
monitor.incrementProgress(1);
// Get the function CONTAINING the ACCEPTED match destination address
// Get the function containing the ACCEPTED match destination address
Function destFunc = destinationListing.getFunctionAt(destEntry.getKey());
LSHCosineVectorAccum dstVector = destEntry.getValue();
// Get the set of possible matches, neighbors, in the SourceProgram
HashMap<Address, DominantPair<Double, VectorCompare>> srcNeighbors = new HashMap<>();
Map<Address, DominantPair<Double, VectorCompare>> srcNeighbors = new HashMap<>();
for (Entry<Address, LSHCosineVectorAccum> srcEntry : srcFuncAddresstoVectorMap.entrySet()) {
Set<Entry<Address, LSHCosineVectorAccum>> srcEntries =
srcVectorsByAddress.entrySet();
for (Entry<Address, LSHCosineVectorAccum> srcEntry : srcEntries) {
Address srcAddr = srcEntry.getKey();
LSHCosineVectorAccum srcVector = srcEntry.getValue();
VectorCompare veccompare = new VectorCompare();
Double similarity = dstVector.compare(srcVector, veccompare);
VectorCompare vectorCompare = new VectorCompare();
double similarity = dstVector.compare(srcVector, vectorCompare);
DominantPair<Double, VectorCompare> compareOut =
new DominantPair<>(similarity, veccompare);
new DominantPair<>(similarity, vectorCompare);
if (dstVector.compare(srcVector, veccompare) > 0) {
if (dstVector.compare(srcVector, vectorCompare) > 0) {
srcNeighbors.put(srcAddr, compareOut);
}
}
List<VTMatchInfo> members = transform(matchSet, destFunc, dstVector, srcNeighbors,
similarityThreshold, minbits, monitor);
monitor);
for (VTMatchInfo member : members) {
if (member != null) {
@@ -170,34 +168,35 @@ public abstract class VTAbstractReferenceProgramCorrelator extends VTAbstractPro
}
/**
* Scoring Mechanism: determines destination similarity and confidence for each
* of the sourceNeighbors and if similarity passes the threshold and confidence passes minbits,
* then VTMatchInfo will be created and added to the result.
*
* @param matchSet - The returned {@code VTMatchSet} for this correlator.
* @param destinationFunction - A {@code Function} in the destination program that references an existing accepted match.
* @param destinationVector - The destination function's feature vector.
* @param neighbors - The set data for possible sourceFunction matches for destinationFunction.
* @param similarityThreshold - The user defined similarity scoring threshold (expected to be between 0 and 1).
* @param minbits - The user defined confidence threshold.
* @param monitor - {@code TaskMonitor}
* Scoring Mechanism: determines destination similarity and confidence for each of the
* sourceNeighbors and if similarity and confidence pass the threshold, then VTMatchInfo will
* be created and added to the result.
*
* @param matchSet match set for this correlator
* @param destinationFunction function in the destination program that references an existing accepted match
* @param destinationVector the destination function's feature vector
* @param neighbors the set data for possible sourceFunction matches for destinationFunction
* @param monitor the monitor
* @return {@code List<VTMatchInfo>} result
* @throws CancelledException if cancelled
*/
private List<VTMatchInfo> transform(VTMatchSet matchSet, Function destinationFunction,
LSHCosineVectorAccum destinationVector,
HashMap<Address, DominantPair<Double, VectorCompare>> neighbors,
double similarityThreshold, double minbits, TaskMonitor monitor) {
Map<Address, DominantPair<Double, VectorCompare>> neighbors, TaskMonitor monitor)
throws CancelledException {
boolean refineResult = getOptions().getBoolean(REFINE_RESULTS, REFINE_RESULTS_DEFAULT);
double confidenceThreshold =
getOptions().getDouble(CONFIDENCE_THRESHOLD, CONFIDENCE_THRESHOLD_DEFAULT);
double similarityThreshold =
getOptions().getDouble(SIMILARITY_THRESHOLD, SIMILARITY_THRESHOLD_DEFAULT);
Address destinationAddress = destinationFunction.getEntryPoint();
int destinationLength = (int) destinationFunction.getBody().getNumAddresses();
List<VTMatchInfo> result = new ArrayList<>();
for (Entry<Address, DominantPair<Double, VectorCompare>> neighbor : neighbors.entrySet()) {
if (monitor.isCancelled()) {
break;
}
monitor.checkCanceled();
Address sourceAddr = neighbor.getKey();
@@ -214,9 +213,10 @@ public abstract class VTAbstractReferenceProgramCorrelator extends VTAbstractPro
continue;
}
if (confidence < minbits) {
if (confidence < confidenceThreshold) {
continue;
}
confidence *= 10.0; // remove when getting rid of log10 stuff
VTMatchInfo match = new VTMatchInfo(matchSet);
@@ -241,28 +241,20 @@ public abstract class VTAbstractReferenceProgramCorrelator extends VTAbstractPro
return result;
}
private static final Comparator<VTMatchInfo> SCORE_COMPARATOR = new Comparator<VTMatchInfo>() {
@Override
public int compare(VTMatchInfo o1, VTMatchInfo o2) {
return o2.getSimilarityScore().compareTo(o1.getSimilarityScore());
}
};
private List<VTMatchInfo> refine(List<VTMatchInfo> list) {
int topN;
Collections.sort(list, SCORE_COMPARATOR);
// take the top N + 1 (to catch duplicates across the N boundary)
topN = Math.min(TOP_N + 1, list.size());
int topN = Math.min(TOP_N + 1, list.size());
list = list.subList(0, topN);
// remove things that are "very equal"
if (list.size() > 1) {
double previousScore = list.get(0).getSimilarityScore().getScore();
int cutoffIndex = 1;
for (int ii = 1; ii < list.size(); ++ii) {
double currentScore = list.get(ii).getSimilarityScore().getScore();
for (int i = 1; i < list.size(); ++i) {
double currentScore = list.get(i).getSimilarityScore().getScore();
if (currentScore > previousScore - EQUALS_EPSILON) {
--cutoffIndex;
break;
@@ -281,9 +273,9 @@ public abstract class VTAbstractReferenceProgramCorrelator extends VTAbstractPro
if (list.size() > 1) {
double bestScore = list.get(0).getSimilarityScore().getScore();
int cutoffIndex = list.size();
for (int ii = 1; ii < list.size(); ++ii) {
if (list.get(ii).getSimilarityScore().getScore() < bestScore - DIFFERENTIAL) {
cutoffIndex = ii;
for (int i = 1; i < list.size(); ++i) {
if (list.get(i).getSimilarityScore().getScore() < bestScore - DIFFERENTIAL) {
cutoffIndex = i;
break;
}
}
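For reference, a minimal standalone sketch of the DIFFERENTIAL cutoff shown in this hunk. The score values, and the use of subList() for the final truncation, are illustrative assumptions rather than code taken from this commit:

import java.util.Arrays;
import java.util.List;

public class DifferentialCutoffSketch {
    private static final double DIFFERENTIAL = 0.2;

    public static void main(String[] args) {
        // similarity scores sorted best-first, as produced by SCORE_COMPARATOR in refine()
        List<Double> scores = Arrays.asList(0.90, 0.85, 0.72, 0.65, 0.40);
        double bestScore = scores.get(0);
        int cutoffIndex = scores.size();
        for (int i = 1; i < scores.size(); ++i) {
            if (scores.get(i) < bestScore - DIFFERENTIAL) {
                cutoffIndex = i; // first score more than 0.2 below the best
                break;
            }
        }
        System.out.println(scores.subList(0, cutoffIndex)); // [0.9, 0.85, 0.72]
    }
}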
@@ -293,297 +285,283 @@ public abstract class VTAbstractReferenceProgramCorrelator extends VTAbstractPro
}
/**
* accumulateFunctionReferences recursively traces the reference chains from a given address
* and returns by reference a list of functions found along the reference chain.
* Recursively traces the reference chains from a given address and returns by reference a
* list of functions found along the reference chain.
*
* @param depth - The initial recursion depth
* @param list - A function accumulation list that is updated by this function
* @param refManager - {@inheritDoc ReferenceManager}
* @param funManager - {@inheritDoc FunctionManager}
* @param listing - The Program listing
* @param address - An address represents a location in a program
* @param depth the initial recursion depth
* @param list a function accumulation list that is updated by this function
* @param program the program
* @param address an address represents a location in a program
*/
private void accumulateFunctionReferences(int depth, List<Function> list,
ReferenceManager refManager, FunctionManager funManager, Listing listing,
Address address) {
private void accumulateFunctionReferences(int depth, Set<Function> list,
Program program, Address address) {
// Do NOT proceed if the max recursion depth has been reached
if (depth >= MAX_DEPTH) {
return;
}
/* If address corresponds to a Thunk Function, in addition to following back references,
* you should collect back-thunk-addresses (not included in references) by using the
* method Function.getFunctionThunkAddresses (Elf programs can have thunks which do
/*
* If address corresponds to a Thunk Function, in addition to following back references,
* you should collect back-thunk-addresses (not included in references) by using the
* method Function.getFunctionThunkAddresses (Elf programs can have thunks which do
* not have a forward reference but thunk another function). You may also need to dedup
* your list of functions returned if this could cause fallout. In addition, you may
* need to watch out for recursion loops which could occur (i.e., a function pointer which
* has a secondary reference to itself - contrived example). *
* your list of functions returned if this could cause fallout. In addition, you may
* need to watch out for recursion loops which could occur (i.e., a function pointer which
* has a secondary reference to itself - contrived example).
*/
// Check for Thunk Function
Function addressFunction = funManager.getFunctionAt(address);
FunctionManager functionManager = program.getFunctionManager();
Function addressFunction = functionManager.getFunctionAt(address);
if (addressFunction != null) {
Address[] thunkAddresses = addressFunction.getFunctionThunkAddresses();
if (thunkAddresses != null) {
for (Address thunkAddress : thunkAddresses) {
if (depth < MAX_DEPTH) {
accumulateFunctionReferences(depth + 1, list, refManager, funManager,
listing, thunkAddress);
}
accumulateFunctionReferences(depth + 1, list, program, thunkAddress);
}
}
}
// Handle References to the address
ReferenceIterator ii = refManager.getReferencesTo(address);
while (ii.hasNext()) {
Reference reference = ii.next();
if (address.isStackAddress() || address.isRegisterAddress()) {
return; // can't have references to these types of addresses
}
ReferenceManager refManager = program.getReferenceManager();
Listing listing = program.getListing();
ReferenceIterator it = refManager.getReferencesTo(address);
while (it.hasNext()) {
Reference reference = it.next();
Address fromAddress = reference.getFromAddress();
CodeUnit codeUnit = listing.getCodeUnitAt(fromAddress);
// if the code unit at the location of the reference is an Instruction, then get the function
// where the reference occurs and determine if it passes the basic VT function match test set above
// if so, add it to the function accumulation list for the original reference
if (codeUnit instanceof Instruction) {
Function function = funManager.getFunctionContaining(fromAddress);
if (function != null) {
if (!function.isThunk()) {
list.add(function);
}
else {
//If a thunk function recurse
accumulateFunctionReferences(depth + 1, list, refManager, funManager,
listing, function.getEntryPoint());
}
// if the code unit at the location of the reference is an Instruction, then get the
// function where the reference occurs and determine if it passes the basic VT function
// match test set above
if (codeUnit instanceof Instruction) {
Function function = functionManager.getFunctionContaining(fromAddress);
if (function == null) {
continue;
}
if (function.isThunk()) {
// also add references to the thunk function
Address entryPoint = function.getEntryPoint();
accumulateFunctionReferences(depth + 1, list, program, entryPoint);
}
else {
//Msg.warn(this, "no function for instruction at " + fromAddress +
// " for reference " + address);
list.add(function);
}
}
else if (codeUnit instanceof Data) {
if (depth < MAX_DEPTH) {
accumulateFunctionReferences(depth + 1, list, refManager, funManager, listing,
fromAddress);
}
}
else {
//Msg.warn(this, "weird non-instruction non-data codeunit: " + codeUnit);
accumulateFunctionReferences(depth + 1, list, program, fromAddress);
}
}
}
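As a side note on the thunk handling described in the comment above, here is a minimal sketch of gathering back-thunk addresses with Function.getFunctionThunkAddresses(); the class and method names in this sketch are illustrative only:

import ghidra.program.model.address.Address;
import ghidra.program.model.listing.Function;
import ghidra.program.model.listing.FunctionManager;
import ghidra.program.model.listing.Program;

public class ThunkBackRefSketch {
    // Prints the entry points of thunks that forward to the function at the given address;
    // these callers are not reachable through ReferenceManager.getReferencesTo() alone.
    static void printBackThunks(Program program, Address address) {
        FunctionManager functionManager = program.getFunctionManager();
        Function function = functionManager.getFunctionAt(address);
        if (function == null) {
            return;
        }
        Address[] thunkAddresses = function.getFunctionThunkAddresses();
        if (thunkAddresses == null) {
            return; // nothing thunks this function
        }
        for (Address thunkAddress : thunkAddresses) {
            System.out.println("thunked at " + thunkAddress);
        }
    }
}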
/**
* Boolean function used to check that a match association is of the correct type (e.g. DATA or FUNCTION) for the given correlator.
* Called by extractReferenceFeatures.
* Used to check that a match association is of the correct type (e.g. DATA or FUNCTION) for
* the given correlator.
*
* @param matchAssocType the type of match.
* @return True or False
* @param associationType the type of match
* @return true if the correct type
*/
protected abstract boolean isExpectedRefType(VTAssociationType matchAssocType);
protected abstract boolean isExpectedRefType(Reference myRef);
protected abstract boolean isExpectedRefType(VTAssociationType associationType);
/**
* extractReferenceFeatures is the core of the reference algorithm. Each accepted match becomes a unique feature.
* At the end, all the source and destination functions will have "vectors" of these features, which
* are unique match ids. Then the LSH dictionary can be made from the source and we can look for matches
* in the destination.
* Used to check that a match association is of the correct type (e.g. DATA or FUNCTION) for
* the given correlator.
*
* @param matchSet The VTMatchSet of previously user-accepted matches.
* @param monitor TaskMonitor
* @param ref the reference
* @return true if the correct type
*/
protected void extractReferenceFeatures(VTMatchSet matchSet, TaskMonitor monitor) {
protected abstract boolean isExpectedRefType(Reference ref);
// Make source and destination maps that will be populated here.
srcFuncAddresstoVectorMap = new HashMap<>();
destFuncAddresstoVectorMap = new HashMap<>();
/**
* extractReferenceFeatures is the core of the reference algorithm. Each accepted match
* becomes a unique feature. At the end, all the source and destination functions will have
* "vectors" of these features, which are unique match ids. Then the LSH dictionary can be
* made from the source and we can look for matches in the destination.
*
* @param matchSet the match set of previously user-accepted matches
* @param monitor the monitor
*/
private void extractReferenceFeatures(VTMatchSet matchSet, TaskMonitor monitor)
throws CancelledException {
srcVectorsByAddress =
LazyMap.lazyMap(new HashMap<>(), addr -> new LSHCosineVectorAccum());
destVectorsByAddress =
LazyMap.lazyMap(new HashMap<>(), addr -> new LSHCosineVectorAccum());
// Get function managers for Source and Destination Programs
FunctionManager srcFuncManager = sourceProgram.getFunctionManager();
FunctionManager destFuncManager = destinationProgram.getFunctionManager();
// get total function counts for computing probabilities
int srcFunctionCount = srcFuncManager.getFunctionCount();
int destFunctionCount = destFuncManager.getFunctionCount();
// setup session
final VTSession session = matchSet.getSession();
int total = 0;
HashMap<String, VTMatchSet> dedupedMatchSets = new HashMap<>();
for (VTMatchSet ms : session.getMatchSets()) {
String name = ms.getProgramCorrelatorInfo().getName();
if (name.equals(correlatorName) ||
(dedupedMatchSets.containsKey(name) && ms.getID() < dedupedMatchSets.get(name).getID())) {
continue;
}
dedupedMatchSets.put(name, ms);
Counter totalMatches = new Counter();
Collection<VTMatchSet> matchSets = getMatchSets(matchSet.getSession(), totalMatches);
monitor.initialize(totalMatches.count);
// get total number of matches in matchSets List
total += ms.getMatchCount();
}
final Collection<VTMatchSet> matchSets = dedupedMatchSets.values();
monitor.initialize(total);
/**
* Loop through the matchSets in order to get total source and dest reference counts that pass the filter.
* Only add matches that pass the isExpectedRefType filter test to the hash tables.
*/
Map<VTMatch, ArrayList<Function>> sourceRefMap = new HashMap<>();
Map<VTMatch, ArrayList<Function>> destinationRefMap = new HashMap<>();
// Loop through the matchSets in order to get total source and destination reference
// counts that pass the filter
Map<VTMatch, Set<Function>> sourceRefMap = new HashMap<>();
Map<VTMatch, Set<Function>> destinationRefMap = new HashMap<>();
for (VTMatchSet ms : matchSets) {
final Collection<VTMatch> matches = ms.getMatches();
Collection<VTMatch> matches = ms.getMatches();
for (VTMatch match : matches) {
// update monitor
if (monitor.isCancelled()) {
return;
}
monitor.checkCanceled();
monitor.incrementProgress(1);
//check match association type and status
final VTAssociation association = match.getAssociation();
final Address sourceAddress = association.getSourceAddress();
final Address destinationAddress = association.getDestinationAddress();
if (isExpectedRefType(association.getType()) &&
association.getStatus() == VTAssociationStatus.ACCEPTED) {
// populate sourceReferences by passing it to accumulateFunctionReferences
ArrayList<Function> sourceReferences = new ArrayList<>();
accumulateFunctionReferences(0, sourceReferences,
sourceProgram.getReferenceManager(), srcFuncManager, sourceListing,
sourceAddress);
ArrayList<Function> destinationReferences = new ArrayList<>();
accumulateFunctionReferences(0, destinationReferences,
destinationProgram.getReferenceManager(), destFuncManager,
destinationListing, destinationAddress);
final int sourceReferenceCountTo = sourceReferences.size();
final int destinationReferenceCountTo = destinationReferences.size();
//If either of the reference lists is empty, skip adding them to the hashtable
if (sourceReferenceCountTo == 0 || destinationReferenceCountTo == 0) {
continue;
}
// Fill Hashtable for use in next loop
sourceRefMap.put(match, sourceReferences);
destinationRefMap.put(match, destinationReferences);
}
accumulateMatchFunctionReferences(sourceRefMap, destinationRefMap, match);
}
}
monitor.setMessage("Adding ACCEPTED matches to feature vectors.");
int featureID = 1;
// for each match that passed the filter above, score it
// score each match that passed the filter above
for (VTMatch match : sourceRefMap.keySet()) {
// update monitor
if (monitor.isCancelled()) {
return;
}
monitor.checkCanceled();
monitor.incrementProgress(1);
// If the match is in one Hashtable it will be in the other by the joint construction above
if (sourceRefMap.get(match) != null) {
if (sourceRefMap.get(match).isEmpty()) {
continue;
}
/**
* Compute raw percentages for the sources and destination counts
* as ratios
* (total references to the match):(total number of references of the correct type)
*/
/**
* Compute raw percentages for the sources and destination counts as ratios
* (total references to the match):(total number of references of the correct type)
*/
// Compute entropy of the system for the given match
Set<Function> srcRefFuncs = new HashSet<>(sourceRefMap.get(match));
Set<Function> destRefFuncs = new HashSet<>(destinationRefMap.get(match));
// Compute entropy of the system for the given match
Set<Function> srcRefFuncs = new HashSet<>(sourceRefMap.get(match));
Set<Function> destRefFuncs = new HashSet<>(destinationRefMap.get(match));
// take the average probability that the feature appears any one function (in either source or dest)
double altPraw = (double) (srcRefFuncs.size() + destRefFuncs.size()) /
(srcFunctionCount + destFunctionCount);
final double weight = Math.sqrt(-Math.log(altPraw));
// take the average probability that the feature appears in any one function (in either
// source or dest)
double altPraw = (double) (srcRefFuncs.size() + destRefFuncs.size()) /
(srcFunctionCount + destFunctionCount);
double weight = Math.sqrt(-Math.log(altPraw));
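// Worked example (illustrative numbers, not from this commit): a feature referenced from
// 2 of 1,000 source functions and 3 of 1,200 destination functions gives
// altPraw = 5 / 2200 ~= 0.00227 and weight = sqrt(-ln(0.00227)) ~= 2.47, so rarer features
// contribute larger weights to the cosine vectors.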
// By the construction above, there may be duplicate functions in the RefMaps
for (Function function : sourceRefMap.get(match)) {
//If function is not in the HashMap, add it
LSHCosineVectorAccum vector = srcFuncAddresstoVectorMap.get(function.getEntryPoint());
if (vector == null) {
vector = new LSHCosineVectorAccum();
srcFuncAddresstoVectorMap.put(function.getEntryPoint(), vector);
}
vector.addHash(featureID, weight);
}
// By the construction above, there may be duplicate functions in the RefMaps
for (Function function : sourceRefMap.get(match)) {
LSHCosineVectorAccum vector =
srcVectorsByAddress.get(function.getEntryPoint());
vector.addHash(featureID, weight);
}
for (Function function : destinationRefMap.get(match)) {
LSHCosineVectorAccum vector = destFuncAddresstoVectorMap.get(function.getEntryPoint());
if (vector == null) {
vector = new LSHCosineVectorAccum();
destFuncAddresstoVectorMap.put(function.getEntryPoint(), vector);
}
vector.addHash(featureID, weight);
}
++featureID;
} //end if match association
for (Function function : destinationRefMap.get(match)) {
LSHCosineVectorAccum vector =
destVectorsByAddress.get(function.getEntryPoint());
vector.addHash(featureID, weight);
}
++featureID;
}
/* At this point the vectors in the sourceMap and the destinationMap contain log weights for
updateSourceAndDestinationVectors(featureID, srcFuncManager, destFuncManager, monitor);
}
private Collection<VTMatchSet> getMatchSets(VTSession session, Counter totalMatches) {
Map<String, VTMatchSet> dedupedMatchSets = new HashMap<>();
for (VTMatchSet ms : session.getMatchSets()) {
String name = ms.getProgramCorrelatorInfo().getName();
// odd checks here: 1) assuming we do not want to include our own results when checking
// matches; 2) why keep only the newest match set data? seems like we should take all
// matches and dedup the matches, not the match sets
if (name.equals(correlatorName) ||
(dedupedMatchSets.containsKey(name) &&
ms.getID() < dedupedMatchSets.get(name).getID())) {
continue;
}
dedupedMatchSets.put(name, ms);
totalMatches.count += ms.getMatchCount();
}
return dedupedMatchSets.values();
}
private void accumulateMatchFunctionReferences(
Map<VTMatch, Set<Function>> sourceRefMap,
Map<VTMatch, Set<Function>> destinationRefMap, VTMatch match) {
// check match association type and status
VTAssociation association = match.getAssociation();
Address sourceAddress = association.getSourceAddress();
Address destinationAddress = association.getDestinationAddress();
if (!isExpectedRefType(association.getType())) {
return;
}
if (association.getStatus() != VTAssociationStatus.ACCEPTED) {
return;
}
Set<Function> sourceReferences = new HashSet<>();
accumulateFunctionReferences(0, sourceReferences, sourceProgram, sourceAddress);
// If either of the reference lists is empty, skip adding them to the map
if (sourceReferences.isEmpty()) {
return;
}
Set<Function> destinationReferences = new HashSet<>();
accumulateFunctionReferences(0, destinationReferences, destinationProgram,
destinationAddress);
// If either of the reference lists is empty, skip adding them to the map
if (destinationReferences.isEmpty()) {
return;
}
// Fill Hashtable for use in next loop
sourceRefMap.put(match, sourceReferences);
destinationRefMap.put(match, destinationReferences);
}
private void updateSourceAndDestinationVectors(int featureID, FunctionManager srcFuncManager,
FunctionManager destFuncManager, TaskMonitor monitor) {
/*
* At this point the vectors in the sourceMap and the destinationMap contain log weights for
* the probability that ACCEPTED MATCHED features appear in any one function in the system.
* Each map has the key:value pair = refFunction:featureVector.
* In order to account unmatched/unaccepted matches that appear in the key set that consists of
* possibly correlated functions, we can consider the cost of a reference switching
* and the cost of a reference being dropped or picked up between versions.
* In order to account unmatched/unaccepted matches that appear in the key set that
* consists of possibly correlated functions, we can consider the cost of a reference
* switching and the cost of a reference being dropped or picked up between versions.
*
* Theoretically this should be dependent on the probability of the referenced element occurring,
* but for the moment we'll consider the model for a generalized switch and drop/pickup.
* Theoretically this should be dependent on the probability of the referenced element
* occurring, but for the moment we'll consider the model for a generalized switch and
* drop/pickup.
*/
monitor.setMessage("Adding unmatched references to feature vectors.");
double pSwitch = 0.5;
double uniqueWeight = Math.sqrt(-Math.log(pSwitch)); //arbitrary weight used to provide negative correlation
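// With pSwitch = 0.5 this evaluates to sqrt(-ln 0.5) = sqrt(ln 2) ~= 0.8326 (noted for
// reference; this observation is not part of the commit).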
/*
* Update Source Vectors
*/
for (Address addr : srcFuncAddresstoVectorMap.keySet()) {
Function func = srcFuncManager.getFunctionAt(addr);
for (Address addr : srcVectorsByAddress.keySet()) {
CodeUnitIterator iter = sourceProgram.getListing().getCodeUnits(func.getBody(), true);
int totalRefs = 0;
while (iter.hasNext()) {
CodeUnit cu = iter.next();
Reference[] memRefs = cu.getReferencesFrom();
for (Reference memRef : memRefs) {
if (isExpectedRefType(memRef)) {
++totalRefs;
}
}
}
LSHCosineVectorAccum srcVector = srcFuncAddresstoVectorMap.get(addr);
int totalRefs = countFunctionRefs(sourceProgram, addr);
LSHCosineVectorAccum srcVector = srcVectorsByAddress.get(addr);
int numEntries = srcVector.numEntries();
for (int i = 0; i < (totalRefs - numEntries); i++) {
srcVector.addHash(featureID, uniqueWeight);
++featureID;
}
}
/*
* Update Destination Vectors
*/
for (Address addr : destFuncAddresstoVectorMap.keySet()) {
Function func = destFuncManager.getFunctionAt(addr);
CodeUnitIterator iter = destinationListing.getCodeUnits(func.getBody(), true);
int totalRefs = 0;
while (iter.hasNext()) {
CodeUnit cu = iter.next();
Reference[] memRefs = cu.getReferencesFrom();
for (Reference memRef : memRefs) {
if (isExpectedRefType(memRef)) {
++totalRefs;
}
}
}
LSHCosineVectorAccum dstVector = destFuncAddresstoVectorMap.get(addr);
for (Address addr : destVectorsByAddress.keySet()) {
int totalRefs = countFunctionRefs(destinationProgram, addr);
LSHCosineVectorAccum dstVector = destVectorsByAddress.get(addr);
int numEntries = dstVector.numEntries();
for (int i = 0; i < (totalRefs - numEntries); i++) {
dstVector.addHash(featureID, uniqueWeight);
@@ -591,4 +569,21 @@ public abstract class VTAbstractReferenceProgramCorrelator extends VTAbstractPro
}
}
}
private int countFunctionRefs(Program program, Address addr) {
Function f = program.getFunctionManager().getFunctionAt(addr);
CodeUnitIterator it = program.getListing().getCodeUnits(f.getBody(), true);
int totalRefs = 0;
while (it.hasNext()) {
CodeUnit cu = it.next();
Reference[] memRefs = cu.getReferencesFrom();
for (Reference memRef : memRefs) {
if (isExpectedRefType(memRef)) {
++totalRefs;
}
}
}
return totalRefs;
}
}
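To make the feature-vector mechanics above concrete, here is a minimal sketch of how the accumulators are compared in findDestinations(). The feature IDs and weights are made-up values, the no-arg constructor and addHash()/compare() calls mirror their use in the hunks above, and the exact similarity value depends on LSHCosineVectorAccum internals:

import generic.lsh.vector.LSHCosineVectorAccum;
import generic.lsh.vector.VectorCompare;

public class VectorCompareSketch {
    public static void main(String[] args) {
        LSHCosineVectorAccum src = new LSHCosineVectorAccum();
        LSHCosineVectorAccum dst = new LSHCosineVectorAccum();
        src.addHash(1, 2.47); // accepted-match feature seen from the source function
        dst.addHash(1, 2.47); // same feature seen from a candidate destination function
        dst.addHash(2, 0.83); // an unmatched reference, added as a negatively correlating entry

        VectorCompare data = new VectorCompare();
        double similarity = dst.compare(src, data);
        System.out.println("similarity = " + similarity); // > 0 keeps the pair as a neighbor candidate
    }
}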

DominantPair.java

@@ -15,16 +15,17 @@
*/
package generic;
import java.util.Objects;
import generic.stl.Pair;
/**
* DominantPair is a pair where the key is responsible for equality and
* hashCode (and the value of the pair doesn't matter at all). This is
* useful when you need the pair itself to function as a key in a Map or
* value in a Set.
* DominantPair is a pair where the key is responsible for equality and hashCode (and the value of
* the pair doesn't matter at all). This is useful when you need the pair itself to function as a
* key in a Map or value in a Set.
*
* @param <K>
* @param <V>
* @param <K> the key type
* @param <V> the value type
*/
public class DominantPair<K, V> extends Pair<K, V> {
public DominantPair(K key, V value) {
@@ -51,15 +52,7 @@ public class DominantPair<K, V> extends Pair<K, V> {
return false;
}
DominantPair<?, ?> other = (DominantPair<?, ?>) obj;
if (first == null) {
if (other.first != null) {
return false;
}
}
else if (!first.equals(other.first)) {
return false;
}
return true;
return Objects.equals(first, other.first);
}
@Override
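A small usage sketch of the equality contract described in the DominantPair Javadoc above (the map contents are illustrative): because only the first element participates in equals() and hashCode(), a DominantPair works as a map key that ignores its second element.

import java.util.HashMap;
import java.util.Map;

import generic.DominantPair;

public class DominantPairSketch {
    public static void main(String[] args) {
        Map<DominantPair<String, Integer>, String> map = new HashMap<>();
        map.put(new DominantPair<>("key", 1), "first");
        // The second put uses an equal key (same first element), so it replaces the existing
        // entry rather than adding a new one.
        map.put(new DominantPair<>("key", 2), "second");
        System.out.println(map.size()); // 1
    }
}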