Skip to content

Commit 0cfe54b

Browse files
committed
Merge remote-tracking branch 'origin/main'
2 parents 62cbe17 + 8422596 commit 0cfe54b

File tree

20 files changed

+12622
-113
lines changed

20 files changed

+12622
-113
lines changed
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
# GeoReferentialSequentialDatabase generates a synthetic geo-referential sequential database.
2+
#
3+
# **Importing this algorithm into a python program**
4+
# --------------------------------------------------------
5+
# from PAMI.extras.generateDatabase import generateSequentialDatabase as db
6+
# obj = db(10,10, 5, 10)
7+
# obj.create()
8+
# obj.save('db.txt')
9+
# print(obj.getSequence()) to get the sequential database as a pandas dataframe
10+
11+
# **Running the code from the command line**
12+
# --------------------------------------------------------
13+
# python generateDatabase.py 10 5 10 db.txt
14+
# cat db.txt
15+
#
16+
17+
18+
__copyright__ = """
19+
Copyright (C) 2024 Rage Uday Kiran
20+
21+
This program is free software: you can redistribute it and/or modify
22+
it under the terms of the GNU General Public License as published by
23+
the Free Software Foundation, either version 3 of the License, or
24+
(at your option) any later version.
25+
26+
This program is distributed in the hope that it will be useful,
27+
but WITHOUT ANY WARRANTY; without even the implied warranty of
28+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29+
GNU General Public License for more details.
30+
31+
You should have received a copy of the GNU General Public License
32+
along with this program. If not, see <https://www.gnu.org/licenses/>.
33+
"""
34+
35+
import math
36+
37+
import numpy as np
38+
import pandas as pd
39+
import sys
40+
41+
42+
class GeoReferentialSequentialDatabase:
    """
    :Description: Generate a synthetic geo-referential sequential database. Every item is mapped to a
                  distinct integer point inside the rectangle [x1, x2) x [y1, y2); every sequence is a
                  list of itemsets (lists of points) separated by ``seqSep``.

    :Attributes:
        numSeq: int
            - number of sequences in the database
        avgItemsetPerSeq: int
            - average number of itemsets in one sequence
        avgItemsPerItemset: int
            - average number of items per itemset
        numItems: int
            - number of distinct items
        maxItem: int (default: numItems)
            - maximum number of items per itemset
        maxItemset: int (default: avgItemsetPerSeq * 2)
            - maximum number of itemsets per sequence
        seqSep: str
            - separator written between two consecutive itemsets
        itemPoint: dict
            - mapping from item id (1..numItems) to its (x, y) point
        db: list
            - the generated sequences (filled by ``create``)

    :Methods:
        create:
            Generate the sequential database
        save:
            Save the sequential database to a file
        getSequence:
            Get the sequential database as a pandas DataFrame
        getTransactions:
            Alias of getSequence
    """

    def __init__(self, numSeq, avgItemsetPerSeq, avgItemsPerItemset, numItems, x1=0, y1=0, x2=100, y2=100,
                 maxItem=0, maxItemset=0, seqSep="-1") -> None:
        """
        Store the generation parameters and assign a distinct point to every item.

        :param numSeq: number of sequences to generate
        :param avgItemsetPerSeq: average number of itemsets per sequence
        :param avgItemsPerItemset: average number of items per itemset
        :param numItems: number of distinct items
        :param x1: inclusive lower x bound of the region
        :param y1: inclusive lower y bound of the region
        :param x2: exclusive upper x bound of the region
        :param y2: exclusive upper y bound of the region
        :param maxItem: maximum items per itemset (0 means numItems)
        :param maxItemset: maximum itemsets per sequence (0 means avgItemsetPerSeq * 2)
        :param seqSep: separator placed between itemsets
        :raises ValueError: if the region holds fewer integer points than numItems
        """
        self.numSeq = numSeq
        self.avgItemsetPerSeq = avgItemsetPerSeq
        self.avgItemsPerItemset = avgItemsPerItemset
        self.numItems = numItems
        self.maxItem = maxItem if maxItem != 0 else numItems
        self.maxItemset = maxItemset if maxItemset != 0 else avgItemsetPerSeq * 2
        self.seqSep = seqSep
        self.db = []

        # A degenerate (empty or inverted) region contains no usable point.
        numPoints = max(x2 - x1, 0) * max(y2 - y1, 0)
        if numItems > numPoints:
            raise ValueError("Number of distinct points in the region is less than the number of items")

        self.itemPoint = {}
        # itemPoint is keyed by item id, so uniqueness of the points themselves
        # has to be tracked separately.
        usedPoints = set()
        for i in range(1, numItems + 1):
            point = self.getPoint(x1, y1, x2, y2)
            while point in usedPoints:
                point = self.getPoint(x1, y1, x2, y2)
            usedPoints.add(point)
            self.itemPoint[i] = point

    def getPoint(self, x1, y1, x2, y2):
        """
        Draw a random integer point from the rectangle [x1, x2) x [y1, y2).

        :param x1: inclusive lower x bound
        :param y1: inclusive lower y bound
        :param x2: exclusive upper x bound
        :param y2: exclusive upper y bound
        :return: the point as a tuple of plain Python ints
        :rtype: tuple
        """
        # Plain ints keep the tuple hashable/comparable and give a clean str() when saved.
        return (int(np.random.randint(x1, x2)), int(np.random.randint(y1, y2)))

    def tuning(self, array, sumRes, minValue=None, maxValue=None):
        """
        Adjust randomly chosen entries by +/-1, in place, until the entries sum to sumRes.

        :param array: values to adjust (modified in place)
        :type array: list or np.ndarray
        :param sumRes: required sum; rounded to the nearest integer because the entries are integers
        :type sumRes: int or float
        :param minValue: optional lower bound that no entry may be decreased below
        :type minValue: int or None
        :param maxValue: optional upper bound that no entry may be increased above
        :type maxValue: int or None
        :return: the same container that was passed in, now summing to the rounded sumRes
        :raises ValueError: if the sum cannot be reached (empty array or bounds too tight)
        """
        target = int(round(sumRes))
        diff = target - int(np.sum(array))
        if diff != 0 and len(array) == 0:
            raise ValueError("Cannot tune an empty array to a non-zero sum")

        while diff != 0:
            step = 1 if diff > 0 else -1
            bound = maxValue if step > 0 else minValue
            if bound is None:
                index = np.random.randint(0, len(array))
            else:
                # Only entries that still have room in the required direction may move.
                current = np.asarray(array)
                movable = np.flatnonzero(current < bound if step > 0 else current > bound)
                if movable.size == 0:
                    raise ValueError("Cannot reach the requested sum within the given bounds")
                index = movable[np.random.randint(0, movable.size)]
            array[index] += step
            diff -= step
        return array

    def generateArray(self, nums, avg, maxItems) -> np.ndarray:
        """
        Generate nums random integers in [1, maxItems] whose mean is avg.

        :param nums: number of values to generate
        :type nums: int
        :param avg: required average; nums * avg is rounded to the nearest integer
        :type avg: float
        :param maxItems: largest value allowed
        :type maxItems: int
        :return: array of nums integers (empty when nums <= 0)
        :rtype: np.ndarray
        :raises ValueError: if no array with values in [1, maxItems] can have the requested average
        """
        nums = int(nums)
        if nums <= 0:
            return np.array([], dtype=int)

        maxItems = int(maxItems)
        target = int(round(nums * avg))
        if maxItems < 1 or not nums <= target <= nums * maxItems:
            raise ValueError("The average must lie between 1 and the maximum value")

        # randint's upper bound is exclusive, hence maxItems + 1.
        values = np.random.randint(1, maxItems + 1, nums)
        values = self.tuning(values, target, 1, maxItems)

        # Avoid a perfectly uniform array when the bounds leave room: move one unit
        # between two distinct positions, which keeps the sum unchanged.
        if nums > 1 and 1 < values[0] < maxItems and np.all(values == values[0]):
            up, down = np.random.choice(nums, 2, replace=False)
            values[up] += 1
            values[down] -= 1

        return values

    def create(self, item="") -> None:
        """
        Generate the sequential database into self.db, replacing any previous content.

        :param item: items to draw from (default: the points assigned to the numItems items)
        :type item: list
        :return: None
        :raises ValueError: if the requested averages cannot be met with the given maxima
        """
        if isinstance(item, str) and item == "":
            pool = list(self.itemPoint.values())
        else:
            pool = list(item)

        self.db = []
        # An itemset cannot hold more distinct items than the pool offers.
        maxPerItemset = min(self.maxItem, len(pool))
        itemsetCounts = self.generateArray(self.numSeq, self.avgItemsetPerSeq, self.maxItemset)

        for numItemset in itemsetCounts:
            sizes = self.generateArray(numItemset, self.avgItemsPerItemset, maxPerItemset)
            seq = []
            for size in sizes:
                # Sample positions rather than the items themselves: the items are tuples,
                # which np.random.choice would reject as a multi-dimensional population.
                chosen = np.random.choice(len(pool), size, replace=False)
                seq.extend(pool[k] for k in chosen)
                seq.append(self.seqSep)
            # Drop the separator that trails the last itemset.
            seq.pop()
            self.db.append(seq)

    def save(self, filename, sep="\t") -> None:
        """
        Save the sequential database to a file, one sequence per line.

        :param filename: name of the file
        :type filename: str
        :param sep: separator written between the entries of a sequence
        :type sep: str
        :return: None
        """
        with open(filename, 'w') as f:
            for line in self.db:
                f.write(sep.join(map(str, line)) + '\n')

    def getSequence(self) -> pd.DataFrame:
        """
        Get the sequential database.

        :return: the sequential database, one row per sequence
        :rtype: pd.DataFrame
        """
        return pd.DataFrame(self.db)

    def getTransactions(self) -> pd.DataFrame:
        """
        Alias of getSequence, kept for callers that use the transactional-generator method name.

        :return: the sequential database, one row per sequence
        :rtype: pd.DataFrame
        """
        return self.getSequence()
236+
237+
238+
if __name__ == "__main__":
    # Smoke test: build a small database (10 sequences, on average 5 itemsets per
    # sequence and 5 items per itemset, 10 distinct items), write it to db.txt
    # and print it as a DataFrame.
    db = GeoReferentialSequentialDatabase(10, 5, 5, 10)
    db.create()
    db.save('db.txt')
    # The class exposes the generated database through getSequence().
    print(db.getSequence())

PAMI/extras/syntheticDataGenerator/GeoreferentialTemporalDatabase.py renamed to PAMI/extras/syntheticDataGenerator/GeoReferentialTemporalDatabase.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import tqdm
88
import pandas as pd
99

10+
1011
class GeoReferentialTemporalDatabase:
1112
"""
1213
This class create synthetic geo-referential temporal database.
@@ -57,6 +58,7 @@ def __init__(
5758
self.seperator = sep
5859
self.occurrenceProbabilityOfSameTimestamp = occurrenceProbabilityOfSameTimestamp
5960
self.occurrenceProbabilityToSkipSubsequentTimestamp = occurrenceProbabilityToSkipSubsequentTimestamp
61+
self.current_timestamp=int()
6062
self._startTime = float()
6163
self._endTime = float()
6264
self._memoryUSS = float()
@@ -76,7 +78,7 @@ def __init__(
7678

7779
def getPoint(self, x1, y1, x2, y2):
7880

79-
return (np.random.randint(x1, x2), np.random.randint(y1, y2))
81+
return (np.random.randint(x1, x2),np.random.randint(y1, y2))
8082

8183
def performCoinFlip(self, probability: float) -> bool:
8284
"""
@@ -86,7 +88,7 @@ def performCoinFlip(self, probability: float) -> bool:
8688
:return: True if the coin lands heads, False otherwise.
8789
"""
8890
result = np.random.choice([0, 1], p=[1 - probability, probability])
89-
return result == 1
91+
return result
9092

9193
def tuning(self, array, sumRes) -> np.ndarray:
9294
"""
@@ -106,15 +108,14 @@ def tuning(self, array, sumRes) -> np.ndarray:
106108
"""
107109

108110
while np.sum(array) != sumRes:
109-
# get index of largest value
110-
randIndex = np.random.randint(0, len(array))
111111
# if sum is too large, decrease the largest value
112112
if np.sum(array) > sumRes:
113-
array[randIndex] -= 1
113+
maxIndex = np.argmax(array)
114+
array[maxIndex] -= 1
114115
# if sum is too small, increase the smallest value
115116
else:
116117
minIndex = np.argmin(array)
117-
array[randIndex] += 1
118+
array[minIndex] += 1
118119
return array
119120

120121
def generateArray(self, nums, avg, maxItems) -> np.ndarray:
@@ -139,7 +140,7 @@ def generateArray(self, nums, avg, maxItems) -> np.ndarray:
139140
"""
140141

141142
# generate n random values
142-
values = np.random.randint(1, maxItems, nums)
143+
values = np.random.randint(1, avg*1.5, nums)
143144

144145
sumRes = nums * avg
145146

@@ -172,39 +173,32 @@ def create(self) -> None:
172173
"""
173174
self._startTime = time.time()
174175
db = set()
175-
lineSize = [] #may be error. need to check it.
176-
sumRes = self.databaseSize * self.avgItemsPerTransaction # Total number of items
177176

177+
values = self.generateArray(self.databaseSize, self.avgItemsPerTransaction, self.numItems)
178+
178179
for i in range(self.databaseSize):
179180
# Determine the timestamp
180181
if self.performCoinFlip(self.occurrenceProbabilityOfSameTimestamp):
181182
timestamp = self.current_timestamp
182183
else:
183-
if self.performCoinFlip(self.occurrenceProbabilityToSkipSubsequentTimestamp):
184+
if self.performCoinFlip(self.occurrenceProbabilityToSkipSubsequentTimestamp)==1:
184185
self.current_timestamp += 2
185186
else:
186187
self.current_timestamp += 1
187188
timestamp = self.current_timestamp
188189

189190
self.db.append([timestamp]) # Start the transaction with the timestamp
190191

191-
lineSize.append([i, 0]) # Initialize lineSize with 0 for each transaction
192-
193-
# Adjust lineSize to ensure sum of sizes equals sumRes
194-
lineSize = self.tuning(lineSize, sumRes)
192+
195193

196194
# For each transaction, generate items
197-
for i in tqdm.tqdm(range(len(lineSize))):
198-
transaction_index = lineSize[i][0]
199-
num_items = lineSize[i][1]
195+
for i in tqdm.tqdm(range(self.databaseSize)):
200196

201-
if num_items > self.numItems:
202-
raise ValueError(
203-
"Error: Either increase numItems or decrease avgItemsPerTransaction or modify percentage")
204-
items = np.random.choice(range(1, self.numItems + 1), num_items, replace=False)
205-
self.db[transaction_index].extend(items)
197+
items = np.random.choice(range(1, self.numItems + 1), values[i], replace=False)
198+
nline = [self.itemPoint[i] for i in items]
199+
self.db[i].extend(nline)
206200

207-
self._runTime = time.time() - self._startTime
201+
self._endTime = time.time()
208202
process = psutil.Process(os.getpid())
209203
self._memoryUSS = process.memory_full_info().uss
210204
self._memoryRSS = process.memory_info().rss

0 commit comments

Comments
 (0)