File size: 4,718 Bytes
0d83ec0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
"""
Needle in a haystack search
Original source code is located here:
https://github.com/agapow/py-gsp/blob/master/gsp/motifsearch.py
"""
"""
A modifiable GSP algorithm.
"""
__version__ = '0.1'
### IMPORTS
### CONSTANTS & DEFINES
PP_INDENT = 3
### CODE ###
class GspSearch (object):
"""
A generic GSP algorithm, alllowing the individual parts to be overridden.
This is setup so the object can be created once, but searched multiple times
at different thresholds. In this generic form, we assume that the transactions
are simply strings.
"""
def __init__ (self, raw_transactions):
"""
C'tor, simply shaping the raw transactions into a useful form.
"""
self.process_transactions (raw_transactions)
def process_transactions (self, raw_transactions):
"""
Create the alphabet & (normalized) transactions.
"""
self.transactions = []
alpha = {}
for r in raw_transactions:
for c in r:
alpha[c] = True
self.transactions.append (r)
self.alpha = alpha.keys()
def generate_init_candidates (self):
"""
Make the initial set of candidate.
Usually this would just be the alphabet.
"""
return list (self.alpha)
def generate_new_candidates (self, freq_pat):
"""
Given existing patterns, generate a set of new patterns, one longer.
"""
old_cnt = len (freq_pat)
old_len = len (freq_pat[0])
print ("Generating new candidates from %s %s-mers ..." % (old_cnt, old_len))
new_candidates = []
for c in freq_pat:
for d in freq_pat:
merged_candidate = self.merge_candidates (c, d)
if merged_candidate and (merged_candidate not in new_candidates):
new_candidates.append (merged_candidate)
## Postconditions & return:
return new_candidates
def merge_candidates (self, a, b):
if a[1:] == b[:-1]:
return a + b[-1:]
else:
return None
def filter_candidates (self, trans_min):
"""
Return a list of the candidates that occur in at least the given number of transactions.
"""
filtered_candidates = []
for c in self.candidates:
curr_cand_hits = self.single_candidate_freq (c)
if trans_min <= curr_cand_hits:
filtered_candidates.append ((c, curr_cand_hits))
return filtered_candidates
def single_candidate_freq (self, c):
"""
Return true if a candidate is found in the transactions.
"""
hits = 0
for t in self.transactions:
if self.search_transaction (t, c):
hits += 1
return hits
def search_transaction (self, t, c):
"""
Does this candidate appear in this transaction?
"""
return (t.find (c) != -1)
def search (self, threshold):
## Preparation:
assert (0.0 < threshold) and (threshold <= 1.0)
trans_cnt = len (self.transactions)
trans_min = trans_cnt * threshold
print ("The number of transactions is: %s" % trans_cnt)
print ("The minimal support is: %s" % threshold)
print ("The minimal transaction support is: %s" % trans_min)
## Main:
# generate initial candidates & do initial filter
self.candidates = list (self.generate_init_candidates())
print ("There are %s initial candidates." % len (self.candidates))
freq_patterns = []
new_freq_patterns = self.filter_candidates (trans_min)
print ("The initial candidates have been filtered down to %s." % len (new_freq_patterns))
while True:
# is there anything left?
if new_freq_patterns:
freq_patterns = new_freq_patterns
else:
return freq_patterns
# if any left, generate new candidates & filter
self.candidates = self.generate_new_candidates ([x[0] for x in freq_patterns])
print ("There are %s new candidates." % len (self.candidates))
new_freq_patterns = self.filter_candidates (trans_min)
print ("The candidates have been filtered down to %s." % len (new_freq_patterns))
### END ###
__version__ = '0.1'
### CONSTANTS & DEFINES
NULL_SYMBOL = 'X'
### CODE ###
def HaystackSearch(needle, haystack):
"""
Return the index of the needle in the haystack
Parameters:
needle: any iterable
haystack: any other iterable
Returns:
the index of the start of needle or -1 if it is not found.
Looking for a sub-list of a list is actually a tricky thing. This
approach uses the Boyer-Moore-Horspool algorithm. Needle and haystack
should be any iterable, as long as their elements are hashable.
Example:
>>> find ([1, 2], [1, 1, 2])
1
>>> find ((1, 2, 3), range (10))
1
>>> find ('gh', 'abcdefghi')
6
>>> find ([2, 3], [7, 8, 9])
-1
"""
h = len (haystack)
n = len (needle)
skip = {needle[i]: n - i - 1 for i in range(n - 1)}
i = n - 1
while i < h:
for j in range(n):
if haystack[i - j] != needle[-j - 1]:
i += skip.get(haystack[i], n)
break
else:
return i - n + 1
return -1 |