Added frequent subcircuit miner to subcircuit library

This commit is contained in:
Clifford Wolf 2013-03-02 13:53:59 +01:00
parent a338d1a082
commit 84cdfa55fc
7 changed files with 368 additions and 13 deletions

2
libs/subcircuit/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
demo
scshell

View File

@ -39,6 +39,7 @@ scshell: scshell.o subcircuit.o
test: scshell
./scshell < test_macc22.txt
./scshell < test_mine.txt
perl test_perm.pl | ./scshell
splrun test_shorts.spl | ./scshell
splrun test_large.spl | ./scshell

View File

@ -14,20 +14,12 @@ Introduction
This is a library that implements a modified Ullmann Subgraph Isomorphism
Algorithm with additional features aimed at working with coarse grain logic
networks.
networks. It also contains a simple frequent subcircuit mining algorithm.
A simple command line tool that exposes the features of the library is also
included.
Under-Construction Warning
--------------------------
This work is under construction. It is likely that there are bugs in the
library that need fixing. Feel free to contact me at clifford@clifford.at
if you have found a bug.
C++11 Warning
-------------
@ -97,6 +89,9 @@ Algorithm are provided by the library.
* Support for finding only non-overlapping matches.
* A simple miner for frequent subcircuits that operates on the same circuit
description format.
* The public API of the library is using std::string identifiers for
nodes, node types and ports. Internally the costly part of the
algorithm is only using integer values, thus speeding up the
@ -328,6 +323,32 @@ bool userCheckSolution(result):
ignored. The default implementation always returns true.
Mining for frequent SubCircuits
-------------------------------
The solver also contains a miner for frequent subcircuits. The following code
fragment will find all frequent subcircuits with at least minNodes nodes and
at most maxNodes nodes that occur at least minMatches times:
std::vector<SubCircuit::Solver::MineResult> results;
mySolver.mine(results, minNodes, maxNodes, minMatches);
The miner works by finding frequent pairs of nodes and then combining them
to larger subcircuits. Because of this incremental strategy the miner only
works as expected on graphs with markAllExtern() set.
The mine() method has an optional fifth parameter that limits the number
of matches counted in one graph. This can be useful when mining for circuits
that are found in at least a number of graphs. E.g. the following call
would find all subcircuits with 5 nodes that are found in at least 7 of
the registered graphs:
mySolver.mine(results, 5, 5, 7, 1);
Note that this miner is not very efficient and therefore its use is not
recommended for large circuits.
Debugging
---------
@ -420,6 +441,10 @@ The following commands can be used in scshell outside a graph ... endgraph block
Call Solver::solve(). The <allow_overlap> must be "1" or "true"
for true and "0" or "false" for false.
mine <min_nodes> <max_nodes> <min_matches> [<limit_matches_per_graph>]
Call Solver::mine().
expect <number>
Print all results so far since the last call to expect. Expect

View File

@ -26,6 +26,7 @@ int main()
SubCircuit::Solver solver;
std::map<std::string, std::set<std::string>> initialMappings;
std::vector<SubCircuit::Solver::Result> results;
std::vector<SubCircuit::Solver::MineResult> mineResults;
std::vector<std::string> cmdBuffer;
bool lastCommandExpect = false;
@ -162,6 +163,12 @@ int main()
continue;
}
if (cmdBuffer[0] == "mine" && 4 <= cmdBuffer.size() && cmdBuffer.size() <= 5) {
solver.mine(mineResults, atoi(cmdBuffer[1].c_str()), atoi(cmdBuffer[2].c_str()),
atoi(cmdBuffer[3].c_str()), cmdBuffer.size() == 5 ? atoi(cmdBuffer[4].c_str()) : -1);
continue;
}
if (cmdBuffer[0] == "clearoverlap" && cmdBuffer.size() == 1) {
solver.clearOverlapHistory();
continue;
@ -179,7 +186,7 @@ int main()
if (cmdBuffer[0] == "expect" && cmdBuffer.size() == 2) {
int expected = atoi(cmdBuffer[1].c_str());
printf("\n-- Expected %d, Got %d --\n", expected, int(results.size()));
printf("\n-- Expected %d, Got %d --\n", expected, int(results.size()) + int(mineResults.size()));
for (int i = 0; i < int(results.size()); i++) {
printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str());
for (const auto &it : results[i].mappings) {
@ -189,9 +196,18 @@ int main()
printf("\n");
}
}
for (auto &result : mineResults) {
printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits);
printf(" primary match in %s:", result.graphId.c_str());
for (auto &node : result.nodes)
printf(" %s", node.nodeId.c_str());
printf("\n");
for (auto &it : result.matchesPerGraph)
printf(" matches in %s: %d\n", it.first.c_str(), it.second);
}
printf("\n");
if (expected != int(results.size())) {
printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size()));
if (expected != int(results.size()) + int(mineResults.size())) {
printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size()) + int(mineResults.size()));
printf(" +----------------+\n");
printf(" | \\|/ ____ \\|/ |\n");
printf(" | \"@'/ ,. \\`@\" |\n");
@ -202,6 +218,7 @@ int main()
return 1;
}
results.clear();
mineResults.clear();
lastCommandExpect = true;
continue;
}
@ -215,7 +232,7 @@ int main()
delete graph;
if (!lastCommandExpect) {
printf("\n-- Got %d --\n", int(results.size()));
printf("\n-- Got %d --\n", int(results.size()) + int(mineResults.size()));
for (int i = 0; i < int(results.size()); i++) {
printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str());
for (const auto &it : results[i].mappings) {
@ -225,6 +242,15 @@ int main()
printf("\n");
}
}
for (auto &result : mineResults) {
printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits);
printf(" primary match in %s:", result.graphId.c_str());
for (auto &node : result.nodes)
printf(" %s", node.nodeId.c_str());
printf("\n");
for (auto &it : result.matchesPerGraph)
printf(" matches in %s: %d\n", it.first.c_str(), it.second);
}
} else
printf("PASSED.\n");

View File

@ -46,6 +46,42 @@ static std::string stringf(const char *fmt, ...)
return string;
}
// Construct a graph that contains only the nodes of `other` whose ids are
// listed in `otherNodes`. The new node indices follow the order of
// `otherNodes`. Every edge referenced by a port bit of a selected node is
// copied (including its constValue and isExtern flags), but its set of
// port bits is restricted to bits that belong to selected nodes.
//
// Every id in `otherNodes` must exist in `other` (checked with assert).
SubCircuit::Graph::Graph(const Graph &other, const std::vector<std::string> &otherNodes)
{
    allExtern = other.allExtern;

    // node index in `other` -> node index in this graph
    std::map<int, int> other2this;
    for (int i = 0; i < int(otherNodes.size()); i++) {
        assert(other.nodeMap.count(otherNodes[i]) > 0);
        other2this[other.nodeMap.at(otherNodes[i])] = i;
        nodeMap[otherNodes[i]] = i;
    }

    // edge index in `other` -> edge index in this graph
    std::map<int, int> edges2this;
    for (auto &i1 : other2this) {
        for (auto &i2 : other.nodes[i1.first].ports) {
            for (auto &i3 : i2.bits) {
                if (edges2this.count(i3.edgeIdx) == 0) {
                    // Read the size *before* operator[] inserts the new key.
                    // Writing `edges2this[k] = edges2this.size()` leaves the
                    // evaluation order unspecified before C++17, so the stored
                    // index could be off by one and overrun `edges` below.
                    int nextIdx = int(edges2this.size());
                    edges2this[i3.edgeIdx] = nextIdx;
                }
            }
        }
    }

    edges.resize(edges2this.size());
    for (auto &it : edges2this) {
        // keep only the port bits that belong to nodes present in this graph
        for (auto &bit : other.edges[it.first].portBits)
            if (other2this.count(bit.nodeIdx) > 0)
                edges[it.second].portBits.insert(BitRef(other2this.at(bit.nodeIdx), bit.portIdx, bit.bitIdx));
        edges[it.second].constValue = other.edges[it.first].constValue;
        edges[it.second].isExtern = other.edges[it.first].isExtern;
    }

    nodes.resize(other2this.size());
    for (auto &it : other2this) {
        nodes[it.second] = other.nodes[it.first];
        // re-target the copied port bits to the new edge indices
        for (auto &i2 : nodes[it.second].ports)
            for (auto &i3 : i2.bits)
                i3.edgeIdx = edges2this.at(i3.edgeIdx);
    }
}
bool SubCircuit::Graph::BitRef::operator < (const BitRef &other) const
{
if (nodeIdx != other.nodeIdx)
@ -1072,6 +1108,197 @@ class SubCircuit::SolverWorker
}
}
// additional data structures and functions for mining
// A set of node indices within one particular graph. Used by the miner to
// represent a candidate (or matched) subcircuit.
struct NodeSet {
    std::string graphId;
    std::set<int> nodes;

    // Build a set from two node indices (a single element if both are equal).
    NodeSet(std::string graphId, int node1, int node2)
        : graphId(graphId), nodes{node1, node2} {
    }

    // Build a set from an arbitrary list of node indices.
    NodeSet(std::string graphId, const std::vector<int> &nodes)
        : graphId(graphId), nodes(nodes.begin(), nodes.end()) {
    }

    // Merge all nodes of `other` into this set. Both sets must refer to
    // the same graph.
    void extend(const NodeSet &other) {
        assert(this->graphId == other.graphId);
        nodes.insert(other.nodes.begin(), other.nodes.end());
    }

    // Returns how many nodes of `other` are not yet in this set, provided
    // both sets belong to the same graph and share at least one node.
    // Returns 0 otherwise.
    int extendCandidate(const NodeSet &other) const {
        if (graphId != other.graphId)
            return 0;

        int shared = 0;
        for (int node : other.nodes)
            shared += int(nodes.count(node));

        if (shared == 0)
            return 0;
        return int(other.nodes.size()) - shared;
    }

    // Strict weak ordering: by graph id first, then by node set.
    bool operator <(const NodeSet &other) const {
        if (graphId == other.graphId)
            return nodes < other.nodes;
        return graphId < other.graphId;
    }
};
// Search for `needle` in every registered graph and append all matches to
// `results`. Overlapping matches are allowed and the number of solutions is
// not limited. Verbose output is suppressed while the search runs and the
// previous setting is restored afterwards. Every haystack graph must have
// allExtern set (checked with assert).
void solveForMining(std::vector<Solver::Result> &results, const GraphData &needle)
{
    const bool savedVerbose = verbose;
    verbose = false;

    for (auto &entry : graphData)
    {
        GraphData &haystack = entry.second;
        assert(haystack.graph.allExtern);

        // the miner does not restrict the search with initial mappings
        std::map<std::string, std::set<std::string>> noInitialMappings;
        std::vector<std::set<int>> enumerationMatrix;
        generateEnumerationMatrix(enumerationMatrix, needle, haystack, noInitialMappings);

        haystack.usedNodes.resize(haystack.graph.nodes.size());
        ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, true, -1);
    }

    verbose = savedVerbose;
}
// Test one candidate node set: build a needle graph from the nodes in
// `testSet` (taken from `graph`), search for it in all registered graphs and
// count the distinct matching node sets.
//
//   results              - receives a MineResult if the candidate is frequent
//                          and has at least `minNodes` nodes
//   usedSets             - node sets already covered by earlier candidates;
//                          every new match is added, even when the candidate
//                          turns out not to be frequent
//   nextPool             - receives the set of matches of a frequent candidate
//   graphId, graph       - the graph the candidate was taken from
//   minMatches           - minimum number of counted matches to be frequent
//   limitMatchesPerGraph - at most this many matches are counted per haystack
//                          graph; a negative value disables the limit
//
// Returns the number of counted matches (after applying the per-graph limit),
// or 0 if that number is below `minMatches`.
int testForMining(std::vector<Solver::MineResult> &results, std::set<NodeSet> &usedSets, std::vector<std::set<NodeSet>> &nextPool, NodeSet &testSet,
        const std::string &graphId, const Graph &graph, int minNodes, int minMatches, int limitMatchesPerGraph)
{
    // build the needle graph from the candidate nodes
    GraphData needle;
    std::vector<std::string> needle_nodes;
    for (int nodeIdx : testSet.nodes)
        needle_nodes.push_back(graph.nodes[nodeIdx].nodeId);
    needle.graph = Graph(graph, needle_nodes);
    // NOTE(review): presumably fills needle.adjMatrix for the new graph -- confirm in diCache
    diCache.add(needle.graph, needle.adjMatrix, graphId, userSolver);

    std::vector<Solver::Result> ullmannResults;
    solveForMining(ullmannResults, needle);

    int matches = 0;
    std::map<std::string, int> matchesPerGraph;
    std::set<NodeSet> thisNodeSetSet;

    for (auto &it : ullmannResults)
    {
        // translate the match into a set of haystack node indices
        std::vector<int> resultNodes;
        for (auto &i2 : it.mappings)
            resultNodes.push_back(graphData[it.haystackGraphId].graph.nodeMap[i2.second.haystackNodeId]);
        NodeSet resultSet(it.haystackGraphId, resultNodes);

        // the same node set may be reported more than once; count it only once
        if (usedSets.count(resultSet) > 0) {
            assert(thisNodeSetSet.count(resultSet) > 0);
            continue;
        }

        usedSets.insert(resultSet);
        thisNodeSetSet.insert(resultSet);

        // The per-graph counter is incremented first, so the comparison must
        // be inclusive: with a limit of N the first N matches in a graph are
        // counted (a strict '<' would count only N-1, i.e. none for N == 1).
        int &graphMatches = matchesPerGraph[it.haystackGraphId];
        graphMatches++;
        if (limitMatchesPerGraph < 0 || graphMatches <= limitMatchesPerGraph)
            matches++;
    }

    if (matches < minMatches)
        return 0;

    if (minNodes <= int(testSet.nodes.size()))
    {
        Solver::MineResult result;
        result.graphId = graphId;
        result.totalMatchesAfterLimits = matches;
        result.matchesPerGraph = matchesPerGraph;
        for (int nodeIdx : testSet.nodes) {
            Solver::MineResultNode resultNode;
            resultNode.nodeId = graph.nodes[nodeIdx].nodeId;
            resultNode.userData = graph.nodes[nodeIdx].userData;
            result.nodes.push_back(resultNode);
        }
        results.push_back(result);
    }

    // the matches of this candidate are the seeds for the next, larger pool
    nextPool.push_back(thisNodeSetSet);
    return matches;
}
// Seed the miner: test every pair of adjacent nodes in every registered graph
// as a two-node candidate. Pairs that were already covered by the matches of
// an earlier candidate are skipped. Frequent pairs are appended to `nodePairs`
// (and to `results` when they satisfy `minNodes`) by testForMining().
void findNodePairs(std::vector<Solver::MineResult> &results, std::vector<std::set<NodeSet>> &nodePairs, int minNodes, int minMatches, int limitMatchesPerGraph)
{
    std::set<NodeSet> seenPairs;

    if (verbose)
        printf("\nFind frequent node pairs:\n");

    for (auto &graphEntry : graphData)
    {
        const std::string &graphId = graphEntry.first;
        const auto &graph = graphEntry.second.graph;

        for (int first = 0; first < int(graph.nodes.size()); first++)
        {
            for (auto &neighbour : graphEntry.second.adjMatrix.at(first))
            {
                const int second = neighbour.first;

                NodeSet candidate(graphId, first, second);
                if (seenPairs.count(candidate) != 0)
                    continue;

                const int matches = testForMining(results, seenPairs, nodePairs, candidate, graphId, graph, minNodes, minMatches, limitMatchesPerGraph);

                if (matches > 0 && verbose) {
                    printf("Pair %s[%s,%s] -> %d\n", graphId.c_str(), graph.nodes[first].nodeId.c_str(),
                            graph.nodes[second].nodeId.c_str(), matches);
                }
            }
        }
    }
}
// Grow the candidate pool by one step: merge every two node sets of the same
// graph that overlap and whose union adds exactly `increment` new nodes, and
// test each distinct merged set with testForMining(). On return `pool` holds
// the match sets of the frequent merged candidates.
void findNextPool(std::vector<Solver::MineResult> &results, std::vector<std::set<NodeSet>> &pool,
        int oldSetSize, int increment, int minNodes, int minMatches, int limitMatchesPerGraph)
{
    // group the current pool by graph; only sets of the same graph can merge
    std::map<std::string, std::vector<const NodeSet*>> setsByGraph;
    for (auto &group : pool) {
        for (auto &nodeSet : group)
            setsByGraph[nodeSet.graphId].push_back(&nodeSet);
    }

    if (verbose)
        printf("\nFind frequent subcircuits of size %d using increment %d:\n", oldSetSize+increment, increment);

    std::vector<std::set<NodeSet>> nextPool;
    std::set<NodeSet> usedSets;

    for (auto &entry : setsByGraph)
    {
        const auto &candidates = entry.second;

        for (int idx1 = 0; idx1 < int(candidates.size()); idx1++)
        {
            for (int idx2 = idx1; idx2 < int(candidates.size()); idx2++)
            {
                const NodeSet &base = *candidates[idx1];
                const NodeSet &addition = *candidates[idx2];

                if (base.extendCandidate(addition) != increment)
                    continue;

                NodeSet mergedSet = base;
                mergedSet.extend(addition);

                // already covered by the matches of an earlier candidate
                if (usedSets.count(mergedSet) != 0)
                    continue;

                const std::string &graphId = entry.first;
                const auto &graph = graphData[entry.first].graph;

                const int matches = testForMining(results, usedSets, nextPool, mergedSet, graphId, graph, minNodes, minMatches, limitMatchesPerGraph);

                if (!verbose)
                    continue;

                printf("Set %s[", graphId.c_str());
                const char *separator = "";
                for (int nodeIdx : mergedSet.nodes) {
                    printf("%s%s", separator, graph.nodes[nodeIdx].nodeId.c_str());
                    separator = ",";
                }
                printf("] -> %d\n", matches);
            }
        }
    }

    pool.swap(nextPool);
}
// interface to the public Solver class
protected:
@ -1151,6 +1378,25 @@ protected:
ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, allowOverlap, maxSolutions > 0 ? results.size() + maxSolutions : -1);
}
// Mine for frequent subcircuits: start with frequent pairs of adjacent nodes
// and repeatedly merge overlapping candidates into larger ones until the
// candidate size reaches `maxNodes`.
//
// While the candidates are still smaller than `minNodes` the size grows as
// fast as merging two overlapping sets allows (size - 1 new nodes), but never
// past `minNodes`. From `minNodes` on it grows by one node per round.
void mine(std::vector<Solver::MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph)
{
    std::vector<std::set<NodeSet>> pool;
    findNodePairs(results, pool, minNodes, minMatches, limitMatchesPerGraph);

    for (int nodeSetSize = 2; nodeSetSize < maxNodes; )
    {
        int increment;
        if (nodeSetSize >= minNodes)
            increment = 1;
        else if (nodeSetSize + (nodeSetSize - 1) >= minNodes)
            increment = minNodes - nodeSetSize;
        else
            increment = nodeSetSize - 1;

        findNextPool(results, pool, nodeSetSize, increment, minNodes, minMatches, limitMatchesPerGraph);
        nodeSetSize += increment;
    }
}
void clearOverlapHistory()
{
for (auto &it : graphData)
@ -1252,6 +1498,11 @@ void SubCircuit::Solver::solve(std::vector<Result> &results, std::string needleG
worker->solve(results, needleGraphId, haystackGraphId, initialMappings, allowOverlap, maxSolutions);
}
// Public entry point of the frequent subcircuit miner. Forwards all arguments
// unchanged to the worker object, which contains the implementation.
void SubCircuit::Solver::mine(std::vector<MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph)
{
worker->mine(results, minNodes, maxNodes, minMatches, limitMatchesPerGraph);
}
void SubCircuit::Solver::clearOverlapHistory()
{
worker->clearOverlapHistory();

View File

@ -73,6 +73,7 @@ namespace SubCircuit
public:
Graph() : allExtern(false) { };
Graph(const Graph &other, const std::vector<std::string> &otherNodes);
void createNode(std::string nodeId, std::string typeId, void *userData = NULL);
void createPort(std::string nodeId, std::string portId, int width = 1, int minWidth = -1);
@ -100,6 +101,17 @@ namespace SubCircuit
std::map<std::string, ResultNodeMapping> mappings;
};
// One node of a mined subcircuit.
struct MineResultNode {
// id of the node within the graph of the primary match
std::string nodeId;
// opaque user pointer copied from that graph node
void *userData;
};
// One frequent subcircuit found by Solver::mine().
struct MineResult {
// id of the graph that contains the primary match (see `nodes`)
std::string graphId;
// number of matches counted over all graphs after the per-graph limit was applied
int totalMatchesAfterLimits;
// number of distinct matches found per graph, keyed by graph id
std::map<std::string, int> matchesPerGraph;
// the nodes of the primary match in graph `graphId`
std::vector<MineResultNode> nodes;
};
private:
SolverWorker *worker;
@ -131,6 +143,9 @@ namespace SubCircuit
void solve(std::vector<Result> &results, std::string needleGraphId, std::string haystackGraphId, bool allowOverlap = true, int maxSolutions = -1);
void solve(std::vector<Result> &results, std::string needleGraphId, std::string haystackGraphId,
const std::map<std::string, std::set<std::string>> &initialMapping, bool allowOverlap = true, int maxSolutions = -1);
void mine(std::vector<MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph = -1);
void clearOverlapHistory();
void clearConfig();
};

View File

@ -0,0 +1,35 @@
# verbose
graph macc22
node mul_1 mul A 32 B 32 Y 32
node mul_2 mul A 32 B 32 Y 32
node add_1 add A 32 B 32 Y 32
connect mul_1 Y add_1 A
connect mul_2 Y add_1 B
allextern
endgraph
graph macc4x2
node mul_1 mul A 32 B 32 Y 32
node mul_2 mul A 32 B 32 Y 32
node mul_3 mul A 32 B 32 Y 32
node mul_4 mul A 32 B 32 Y 32
node add_1 add A 32 B 32 Y 32
node add_2 add A 32 B 32 Y 32
node add_3 add A 32 B 32 Y 32
connect mul_1 Y add_1 A
connect mul_2 Y add_1 B
connect mul_3 Y add_2 A
connect mul_4 Y add_2 B
connect add_1 Y add_3 A
connect add_2 Y add_3 B
allextern
endgraph
swapgroup mul A B
swapgroup add A B
mine 2 10 2
expect 5