Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/data_preprocess/doc_index/
/data_preprocess/doc_index.zip
6 changes: 4 additions & 2 deletions .idea/TTDS.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

104 changes: 104 additions & 0 deletions data_preprocess/data_preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import json
import os
import pickle
import re

import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Compiled once: a token is a maximal run of lowercase ASCII letters/digits
# that starts at a word boundary.
_TOKEN_RE = re.compile(r'\b[a-z0-9][a-z0-9]*')

# Lazily-filled cache for the default NLTK resources, so the stop-word corpus
# is read and the stemmer is built at most once per process instead of once
# per call.
_DEFAULT_RESOURCES = {}


def preprocess(line, stopword_set=None, *, stem_fn=None):
    """Turn a raw text line into a list of stemmed, stop-word-free tokens.

    The pipeline is: case folding -> tokenisation -> stop-word removal ->
    stemming. Token order is preserved, so positions in the returned list
    can be used as positions in a positional index.

    Args:
        line: The text to process.
        stopword_set: Container of lowercase words to drop (anything that
            supports ``in``). ``None`` selects NLTK's English stop-word
            list; an empty container disables stopping.
        stem_fn: Callable mapping one token to its stem. ``None`` selects
            the English Snowball stemmer.

    Returns:
        A list of stems, one per surviving token.
    """
    # ``is None`` (not truthiness) so an explicitly empty set is honoured.
    if stopword_set is None:
        if 'stop_words' not in _DEFAULT_RESOURCES:
            _DEFAULT_RESOURCES['stop_words'] = frozenset(
                stopwords.words('english'))
        stopword_set = _DEFAULT_RESOURCES['stop_words']

    if stem_fn is None:
        if 'stem_fn' not in _DEFAULT_RESOURCES:
            _DEFAULT_RESOURCES['stem_fn'] = SnowballStemmer("english").stem
        stem_fn = _DEFAULT_RESOURCES['stem_fn']

    # Tokens come out of the regex already lowercase, so no further case
    # folding is needed before the stop-word lookup.
    tokens = _TOKEN_RE.findall(line.lower())
    return [stem_fn(token) for token in tokens if token not in stopword_set]

# ---------------------------------------------------------------------------
# Build a positional inverted index over the recipe collection and pickle it.
#
# Input : recipes.json -- a list of recipes, each with 'id', 'title',
#         'instructions' and 'ingredients' (the latter two are lists of
#         objects carrying a 'text' field).
# Output: ./doc_index/<token>        {doc_id: [positions]} for each token
#         ./doc_index/term_frequency {token: {doc_id: count}}
#         ./doc_index/doc_len        {doc_id: number of processed tokens}
#         ./doc_index/num_docs       total number of recipes seen
# ---------------------------------------------------------------------------

# JSON text is UTF-8; being explicit avoids depending on the platform's
# default encoding.
with open('recipes.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# token -> {doc_id -> [1-based positions of the token in the document]}
dataset_index = {}
# token -> {doc_id -> number of occurrences of the token in the document}
term_frequency = {}
# doc_id -> number of tokens left after preprocessing
doc_len = {}
# number of recipes processed so far
num_docs = 0

for data in dataset:
    num_docs += 1
    if num_docs % 10000 == 0:
        print('current state: {}'.format(num_docs))

    doc_id = data['id']

    # Join title, instruction steps and ingredients with single spaces.
    # The separator after the title matters: without it the last word of the
    # title is fused with the first word of the following field into one
    # bogus token.
    fields = [data['title']]
    fields.extend(step['text'] for step in data['instructions'])
    fields.extend(item['text'] for item in data['ingredients'])
    processed_text = preprocess(' '.join(fields))

    doc_len[doc_id] = len(processed_text)

    # Positions are 1-based.
    for position_index, token in enumerate(processed_text, start=1):
        postings = dataset_index.setdefault(token, {})
        postings.setdefault(doc_id, []).append(position_index)

        counts = term_frequency.setdefault(token, {})
        counts[doc_id] = counts.get(doc_id, 0) + 1

# Make sure the output directory exists before writing into it.
os.makedirs('./doc_index', exist_ok=True)

# One pickle file per token, named after the token itself. Tokens consist
# only of [a-z0-9], so they cannot clash with the underscore-named files
# written below.
for key, value in dataset_index.items():
    with open('./doc_index/' + key, 'wb') as f:
        pickle.dump(value, f)

with open('./doc_index/term_frequency', 'wb') as f:
    pickle.dump(term_frequency, f)

with open('./doc_index/doc_len', 'wb') as f:
    pickle.dump(doc_len, f)

with open('./doc_index/num_docs', 'wb') as f:
    pickle.dump(num_docs, f)
11 changes: 11 additions & 0 deletions data_preprocess/pickle_writer.py

Large diffs are not rendered by default.

File renamed without changes.
Binary file removed mysite/db.sqlite3
Binary file not shown.
2 changes: 1 addition & 1 deletion mysite/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def main():
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)

Expand Down
3 changes: 1 addition & 2 deletions mysite/mysite/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@

INSTALLED_APPS = [
'recipe',
'polls.apps.PollsConfig',
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
Expand Down Expand Up @@ -78,7 +77,7 @@
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
'NAME': BASE_DIR / 'recipes.db',
}
}

Expand Down
1 change: 0 additions & 1 deletion mysite/mysite/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,5 @@

urlpatterns = [
path('recipe/', include('recipe.urls')),
path('polls/', include('polls.urls')),
path('admin/', admin.site.urls),
]
5 changes: 0 additions & 5 deletions mysite/polls/admin.py

This file was deleted.

6 changes: 0 additions & 6 deletions mysite/polls/apps.py

This file was deleted.

32 changes: 0 additions & 32 deletions mysite/polls/migrations/0001_initial.py

This file was deleted.

Empty file.
23 changes: 0 additions & 23 deletions mysite/polls/models.py

This file was deleted.

21 changes: 0 additions & 21 deletions mysite/polls/templates/polls/detail.html

This file was deleted.

12 changes: 0 additions & 12 deletions mysite/polls/templates/polls/index.html

This file was deleted.

9 changes: 0 additions & 9 deletions mysite/polls/templates/polls/results.html

This file was deleted.

3 changes: 0 additions & 3 deletions mysite/polls/tests.py

This file was deleted.

24 changes: 0 additions & 24 deletions mysite/polls/urls.py

This file was deleted.

61 changes: 0 additions & 61 deletions mysite/polls/views.py

This file was deleted.

6 changes: 3 additions & 3 deletions mysite/recipe/admin.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from django.contrib import admin

# Register your models here.
from .models import Recipe, TokenData
from .models import Recipes


admin.site.register(Recipe)
admin.site.register(TokenData)
admin.site.register(Recipes)

Loading