From d07f723cddf849af720da15e245ea3342dc35530 Mon Sep 17 00:00:00 2001
From: Daniel Kislyuk
Date: Tue, 13 Apr 2021 12:20:22 +0300
Subject: [PATCH 1/8] This commit adds two improved search functionalities.

The first one is based on memcached and regex. The columns of interest are
collected in a separate memcached store, which is populated via a cron job.
The query string is then parsed and transformed into a regex, which is applied
to the cache to retrieve the event ids.

The second search is based on the Postgres full-text search functionality and
hence is language specific. Separate tsvector columns are created in the db
for Finnish, Swedish, and English. They are populated and kept up to date on
the db side, see migration 0080.

urls.py has a small correction of the imports, not related to the main topic
of the commit.
---
 events/api.py                                 | 194 ++++++++++++++----
 .../commands/populate_local_event_cache.py    |  30 ++-
 .../0080_populate_tsvectors_and_indices.py    |  86 ++++++++
 events/migrations/0081_event_local.py         |  23 +++
 events/models.py                              |  20 ++
 events/tests/conftest.py                      |  43 ++--
 events/tests/test_event_get.py                |  37 ++++
 .../templates/rest_framework/event_list.html  |  46 ++++-
 linkedevents/settings.py                      |   5 +-
 linkedevents/urls.py                          |   2 +-
 requirements.in                               |   1 +
 requirements.txt                              |   1 +
 12 files changed, 420 insertions(+), 68 deletions(-)
 create mode 100644 events/migrations/0080_populate_tsvectors_and_indices.py
 create mode 100644 events/migrations/0081_event_local.py

diff --git a/events/api.py b/events/api.py
index 7c7d328ec..f18881c35 100644
--- a/events/api.py
+++ b/events/api.py
@@ -15,8 +15,9 @@
 import bleach
 import django_filters
 import pytz
+import regex
 from django.conf import settings
-from django.contrib.postgres.search import TrigramSimilarity
+from django.contrib.postgres.search import SearchQuery, TrigramSimilarity
 from django.core.cache import caches
 from django.core.exceptions import PermissionDenied
 from django.db.models import Q, QuerySet
@@ -369,18 +370,6 @@ def __init__(self, *args, **kwargs):
                 del self.fields[key]
             del self.fields[field_name]
 
-    # def get_field(self, model_field):
-    #     kwargs = {}
-    #     if issubclass(
-    #             model_field.__class__,
-    #             (django_db_models.CharField,
-    #              django_db_models.TextField)):
-    #         if model_field.null:
-    #             kwargs['allow_none'] = True
-    #         kwargs['max_length'] = getattr(model_field, 'max_length')
-    #         return fields.CharField(**kwargs)
-    #     return super(TranslatedModelSerializer, self).get_field(model_field)
-
     def to_representation(self, obj):
         ret = super(TranslatedModelSerializer, self).to_representation(obj)
         if obj is None:
@@ -1748,32 +1737,154 @@ def parse_duration_string(duration):
     return int(val) * mul
 
 
+def _terms_to_regex(terms, operator, fuzziness=3):
+    """
+    Create a compiled regex from the provided terms, of the form
+    r'(\b(term1){e<3})|(\b(term2){e<3})'. This would match a string
+    containing the terms, allowing up to two edits per term.
+    """
+
+    vals = terms.split(',')
+    valexprs = [r'(\b' + f'({val}){{e<{fuzziness}}})' for val in vals]
+    if operator == 'AND':
+        regex_join = ''
+    elif operator == 'OR':
+        regex_join = '|'
+    expr = f"{regex_join.join(valexprs)}"
+    return regex.compile(expr, regex.IGNORECASE)
+
+
 def _filter_event_queryset(queryset, params, srs=None):
     """
     Filter events queryset by params
     (e.g. self.request.query_params in EventViewSet)
     """
     # Filter by string (case insensitive).
This searches from all fields # which are marked translatable in translation.py - val = params.get('text', None) + + val = params.get('local_ongoing_text', None) if val: - val = val.lower() - qset = Q() + language = params.get('language', 'fi') + langs = settings.FULLTEXT_SEARCH_LANGUAGES + if language not in langs.keys(): + raise ParseError(f"{language} not supported. Supported options are: {' '.join(settings.FULLTEXT_SEARCH_LANGUAGES)}") # noqa E501 + + query = SearchQuery(val, config=langs[language], search_type='plain') + kwargs = {f'search_vector_{language}': query} + queryset = queryset.filter(**kwargs).filter(end_time__gte=datetime.utcnow().replace(tzinfo=pytz.utc), + deleted=False, + local=True) + cache = caches['ongoing_events'] + val = params.get('local_ongoing_OR', None) + if val: + rc = _terms_to_regex(val, 'OR') + ids = {k for k, v in cache.get('local_ids').items() if rc.search(v, concurrent=True)} + queryset = queryset.filter(id__in=ids) - # Free string search from all translated event fields - event_fields = EventTranslationOptions.fields - for field in event_fields: - # check all languages for each field - qset |= _text_qset_by_translated_field(field, val) + val = params.get('local_ongoing_AND', None) + if val: + rc = _terms_to_regex(val, 'AND') + ids = {k for k, v in cache.get('local_ids').items() if rc.search(v, concurrent=True)} + queryset = queryset.filter(id__in=ids) - # Free string search from all translated place fields - place_fields = PlaceTranslationOptions.fields - for field in place_fields: - location_field = 'location__' + field - # check all languages for each field - qset |= _text_qset_by_translated_field(location_field, val) + val = params.get('internet_ongoing_AND', None) + if val: + rc = _terms_to_regex(val, 'AND') + ids = {k for k, v in cache.get('internet_ids').items() if rc.search(v, concurrent=True)} + queryset = queryset.filter(id__in=ids) - queryset = queryset.filter(qset) + val = params.get('internet_ongoing_OR', None) + if val: + rc = _terms_to_regex(val, 'OR') + ids = {k for k, v in cache.get('internet_ids').items() if rc.search(v, concurrent=True)} + queryset = queryset.filter(id__in=ids) + + val = params.get('all_ongoing', None) + if val and validate_bool(val, 'all_ongoing'): + ids = {k for i in cache.get_many(['internet_ids', 'local_ids']).values() for k, v in i.items()} + queryset = queryset.filter(id__in=ids) + + val = params.get('all_ongoing_AND', None) + if val: + rc = _terms_to_regex(val, 'AND') + cached_ids = {k: v for i in cache.get_many(['internet_ids', 'local_ids']).values() for k, v in i.items()} + ids = {k for k, v in cached_ids.items() if rc.search(v, concurrent=True)} + queryset = queryset.filter(id__in=ids) + + val = params.get('all_ongoing_OR', None) + if val: + rc = _terms_to_regex(val, 'OR') + cached_ids = {k: v for i in cache.get_many(['internet_ids', 'local_ids']).values() for k, v in i.items()} + ids = {k for k, v in cached_ids.items() if rc.search(v, concurrent=True)} + queryset = queryset.filter(id__in=ids) + + vals = params.get('keyword_set_AND', None) + if vals: + vals = vals.split(',') + keyword_sets = KeywordSet.objects.filter(id__in=vals) + for keyword_set in keyword_sets: + keywords = keyword_set.keywords.all() + qset = Q(keywords__in=keywords) + queryset = queryset.filter(qset) + + vals = params.get('keyword_set_OR', None) + if vals: + vals = vals.split(',') + keyword_sets = KeywordSet.objects.filter(id__in=vals) + all_keywords = set() + for keyword_set in keyword_sets: + keywords = keyword_set.keywords.all() + 
all_keywords.update(keywords)
+        queryset = queryset.filter(keywords__in=all_keywords)
+
+    if 'local_ongoing_OR_set' in ''.join(params):
+        count = 1
+        all_ids = []
+        while f'local_ongoing_OR_set{count}' in params:
+            val = params.get(f'local_ongoing_OR_set{count}', None)
+            if val:
+                rc = _terms_to_regex(val, 'OR')
+                all_ids.append({k for k, v in cache.get('local_ids').items() if rc.search(v, concurrent=True)})
+            count += 1
+        ids = set.intersection(*all_ids)
+        queryset = queryset.filter(id__in=ids)
+
+    if 'internet_ongoing_OR_set' in ''.join(params):
+        count = 1
+        all_ids = []
+        while f'internet_ongoing_OR_set{count}' in params:
+            val = params.get(f'internet_ongoing_OR_set{count}', None)
+            if val:
+                rc = _terms_to_regex(val, 'OR')
+                all_ids.append({k for k, v in cache.get('internet_ids').items() if rc.search(v, concurrent=True)})
+            count += 1
+        ids = set.intersection(*all_ids)
+        queryset = queryset.filter(id__in=ids)
+
+    if 'all_ongoing_OR_set' in ''.join(params):
+        count = 1
+        all_ids = []
+        while f'all_ongoing_OR_set{count}' in params:
+            val = params.get(f'all_ongoing_OR_set{count}', None)
+            if val:
+                rc = _terms_to_regex(val, 'OR')
+                cached_ids = {k: v for i in cache.get_many(['internet_ids', 'local_ids']).values() for k, v in i.items()}  # noqa E501
+                all_ids.append({k for k, v in cached_ids.items() if rc.search(v, concurrent=True)})
+            count += 1
+        ids = set.intersection(*all_ids)
+        queryset = queryset.filter(id__in=ids)
+
+    if 'keyword_OR_set' in ''.join(params):
+        rc = regex.compile('keyword_OR_set[0-9]*')
+        all_sets = rc.findall(''.join(params))
+        for i in all_sets:
+            val = params.get(i, None)
+            if val:
+                val = val.split(',')
+                queryset = queryset.filter(keywords__pk__in=val)
+
+    val = params.get('internet_based', None)
+    if val and validate_bool(val, 'internet_based'):
+        queryset = queryset.filter(location__id__contains='internet')
 
     # Filter by event translated fields and keywords combined. The code is
     # repeated as this is the first iteration, which will be replaced by a similarity
@@ -1807,14 +1918,25 @@ def _filter_event_queryset(queryset, params, srs=None):
         qset = Q()
         queryset = queryset.filter(*qsets)
 
-    # This filtering param requires populate_local_event_cache management command
-    val = params.get('combined_local_ongoing', None)
+    val = params.get('text', None)
     if val:
-        cache = caches['ongoing_local']
         val = val.lower()
-        vals = val.split(',')
-        ids = {k for k, v in cache.get('ids').items() if any(val in v for val in vals)}
-        queryset = queryset.filter(id__in=ids)
+        qset = Q()
+
+        # Free string search from all translated event fields
+        event_fields = EventTranslationOptions.fields
+        for field in event_fields:
+            # check all languages for each field
+            qset |= _text_qset_by_translated_field(field, val)
+
+        # Free string search from all translated place fields
+        place_fields = PlaceTranslationOptions.fields
+        for field in place_fields:
+            location_field = 'location__' + field
+            # check all languages for each field
+            qset |= _text_qset_by_translated_field(location_field, val)
+
+        queryset = queryset.filter(qset)
 
     val = params.get('last_modified_since', None)
     # This should be in format which dateutil.parser recognizes, e.g.
@@ -2016,7 +2138,7 @@ def _filter_event_queryset(queryset, params, srs=None): q = q | Q(in_language__id=lang) | Q(**name_arg) | Q(**desc_arg) | Q(**short_desc_arg) else: q = q | Q(in_language__id=lang) - queryset = queryset.filter(q) + queryset = queryset.filter(q).distinct() # Filter by in_language field only val = params.get('in_language', None) diff --git a/events/management/commands/populate_local_event_cache.py b/events/management/commands/populate_local_event_cache.py index a48db30c4..ce93360f6 100644 --- a/events/management/commands/populate_local_event_cache.py +++ b/events/management/commands/populate_local_event_cache.py @@ -9,10 +9,12 @@ class Command(BaseCommand): - help = "Update local ongoing and upcoming events cache." + help = "Update local and internet-based ongoing and upcoming events cache. Note that cache has to be set up and\ + its memory limits will probably need adjustment. In case memcached is used, check -m and\ + -I parameters." def handle(self, *args, **options): - cache = caches['ongoing_local'] + cache = caches['ongoing_events'] local_events = Event.objects.filter(location__divisions__ocd_id__endswith=MUNIGEO_MUNI, end_time__gte=datetime.utcnow().replace(tzinfo=pytz.utc), @@ -30,6 +32,26 @@ def handle(self, *args, **options): event_dict[i[0]].update(i[1:]) event_dict[i[0]].discard(None) - event_strings = {k: " ".join(v) for k, v in event_dict.items()} + event_strings = {k: " ".join(v).replace('\n', ' ').replace('\r', ' ') for k, v in event_dict.items()} + + cache.set('local_ids', event_strings) + + inet_events = Event.objects.filter(location__id__endswith='internet', + end_time__gte=datetime.utcnow().replace(tzinfo=pytz.utc), + deleted=False, + ).values_list('id', 'name', 'description', 'short_description', + 'name_en', 'description_en', 'short_description_en', + 'name_sv', 'description_sv', 'short_description_sv', + 'keywords__name_fi', 'keywords__name_sv', 'keywords__name_en', # noqa E501 + 'location__street_address_fi', 'location__street_address_sv', + 'location__name_fi', 'location__name_sv', 'location__name_en', # noqa E501 + 'location__description_fi', 'location__description_sv', + 'location__description_en') + event_dict = {i[0]: set() for i in inet_events} + for i in inet_events: + event_dict[i[0]].update(i[1:]) + event_dict[i[0]].discard(None) + + event_strings = {k: " ".join(v).replace('\n', ' ').replace('\r', ' ') for k, v in event_dict.items()} - cache.set('ids', event_strings) + cache.set('internet_ids', event_strings) diff --git a/events/migrations/0080_populate_tsvectors_and_indices.py b/events/migrations/0080_populate_tsvectors_and_indices.py new file mode 100644 index 000000000..95cc2ae9b --- /dev/null +++ b/events/migrations/0080_populate_tsvectors_and_indices.py @@ -0,0 +1,86 @@ +# Generated by Django 2.2.13 on 2021-03-19 09:44 +'''This migration adds language-specific tsvector columns, triggers to update them and indices + needed for the full-text search on events_event table. 
+'''
+from django.db import migrations, models
+from django.contrib.postgres.search import SearchVectorField
+
+
+class Migration(migrations.Migration):
+    atomic = False
+
+    dependencies = [
+        ('events', '0079_add_search_vectors'),
+    ]
+
+    operations = [
+        migrations.SeparateDatabaseAndState(
+            database_operations=[
+                migrations.RunSQL(sql=
+                    ["UPDATE events_event SET search_vector_fi ="
+                     "setweight(to_tsvector('finnish', coalesce(name_fi, '')), 'A') || setweight(to_tsvector('finnish', coalesce(short_description_fi, '')), 'B') || setweight(to_tsvector('finnish', coalesce(description_fi, '')), 'C');",
+                     "UPDATE events_event SET search_vector_sv ="
+                     "setweight(to_tsvector('swedish', coalesce(name_sv, '')), 'A') || setweight(to_tsvector('swedish', coalesce(short_description_sv, '')), 'B') || setweight(to_tsvector('swedish', coalesce(description_sv, '')), 'C');",
+                     "UPDATE events_event SET search_vector_en ="
+                     "setweight(to_tsvector('english', coalesce(name_en, '')), 'A') || setweight(to_tsvector('english', coalesce(short_description_en, '')), 'B') || setweight(to_tsvector('english', coalesce(description_en, '')), 'C');",
+
+                     "CREATE FUNCTION events_finnish_content_trigger_function() RETURNS trigger AS $$ "
+                     "begin "
+                     "new.search_vector_fi := "
+                     "setweight(to_tsvector('pg_catalog.finnish', coalesce(new.name_fi,'')), 'A') || "
+                     "setweight(to_tsvector('pg_catalog.finnish', coalesce(new.short_description_fi,'')), 'B') || "
+                     "setweight(to_tsvector('pg_catalog.finnish', coalesce(new.description_fi,'')), 'C');"
+                     "return new; "
+                     "end "
+                     "$$ LANGUAGE plpgsql; "
+
+                     "CREATE FUNCTION events_swedish_content_trigger_function() RETURNS trigger AS $$ "
+                     "begin "
+                     "new.search_vector_sv := "
+                     "setweight(to_tsvector('pg_catalog.swedish', coalesce(new.name_sv,'')), 'A') || "
+                     "setweight(to_tsvector('pg_catalog.swedish', coalesce(new.short_description_sv,'')), 'B') || "
+                     "setweight(to_tsvector('pg_catalog.swedish', coalesce(new.description_sv,'')), 'C');"
+                     "return new; "
+                     "end "
+                     "$$ LANGUAGE plpgsql; "
+
+                     "CREATE FUNCTION events_english_content_trigger_function() RETURNS trigger AS $$ "
+                     "begin "
+                     "new.search_vector_en := "
+                     "setweight(to_tsvector('pg_catalog.english', coalesce(new.name_en,'')), 'A') || "
+                     "setweight(to_tsvector('pg_catalog.english', coalesce(new.short_description_en,'')), 'B') || "
+                     "setweight(to_tsvector('pg_catalog.english', coalesce(new.description_en,'')), 'C');"
+                     "return new; "
+                     "end "
+                     "$$ LANGUAGE plpgsql; "
+
+                     "CREATE TRIGGER events_finnish_content_trigger BEFORE INSERT OR UPDATE ON events_event FOR EACH ROW EXECUTE PROCEDURE events_finnish_content_trigger_function();"
+                     "CREATE TRIGGER events_swedish_content_trigger BEFORE INSERT OR UPDATE ON events_event FOR EACH ROW EXECUTE PROCEDURE events_swedish_content_trigger_function();"
+                     "CREATE TRIGGER events_english_content_trigger BEFORE INSERT OR UPDATE ON events_event FOR EACH ROW EXECUTE PROCEDURE events_english_content_trigger_function();"
+                    ],
+
+                    reverse_sql=[
+                        "DROP TRIGGER events_finnish_content_trigger ON events_event;"
+                        "DROP FUNCTION events_finnish_content_trigger_function;"
+                        "DROP TRIGGER events_swedish_content_trigger ON events_event;"
+                        "DROP FUNCTION events_swedish_content_trigger_function;"
+                        "DROP TRIGGER events_english_content_trigger ON events_event;"
+                        "DROP FUNCTION events_english_content_trigger_function;"
+                    ]
+                )
+            ]
+        ),
+        migrations.SeparateDatabaseAndState(
+            database_operations=[
+                migrations.RunSQL(sql="CREATE INDEX CONCURRENTLY events_finnish_content_index ON events_event USING
GIN (search_vector_fi);", + reverse_sql="DROP INDEX events_finnish_content_index;" + ), + migrations.RunSQL(sql="CREATE INDEX CONCURRENTLY events_swedish_content_index ON events_event USING GIN (search_vector_sv);", + reverse_sql="DROP INDEX events_swedish_content_index;" + ), + migrations.RunSQL(sql="CREATE INDEX CONCURRENTLY events_english_content_index ON events_event USING GIN (search_vector_en);", + reverse_sql="DROP INDEX events_english_content_index;" + ) + ] + ) + ] diff --git a/events/migrations/0081_event_local.py b/events/migrations/0081_event_local.py new file mode 100644 index 000000000..a1ee3381b --- /dev/null +++ b/events/migrations/0081_event_local.py @@ -0,0 +1,23 @@ +# Generated by Django 2.2.13 on 2021-04-12 19:50 + +from django.db import migrations, models +from linkedevents.settings import MUNIGEO_MUNI + +def forwards_func(apps, schema_editor): + Event = apps.get_model("events", "Event") + Event.objects.filter(location__divisions__ocd_id__endswith=MUNIGEO_MUNI).update(local=True) + +class Migration(migrations.Migration): + + dependencies = [ + ('events', '0080_populate_tsvectors_and_indices'), + ] + + operations = [ + migrations.AddField( + model_name='event', + name='local', + field=models.BooleanField(default=False), + ), + migrations.RunPython(forwards_func), + ] diff --git a/events/models.py b/events/models.py index 21d877f42..ae62a5615 100644 --- a/events/models.py +++ b/events/models.py @@ -27,9 +27,12 @@ from django.contrib.contenttypes.models import ContentType from django.contrib.gis.db import models from django.contrib.postgres.fields import HStoreField +from django.contrib.postgres.indexes import Index +from django.contrib.postgres.search import SearchVectorField from django.contrib.sites.models import Site from django.core.mail import send_mail from django.db import transaction +from django.db.models import Q from django.db.models.signals import m2m_changed from django.dispatch import receiver from django.utils.encoding import python_2_unicode_compatible @@ -382,6 +385,12 @@ def can_be_edited_by(self, user): class Meta: verbose_name = _('keyword') verbose_name_plural = _('keywords') + indexes = [ + Index(name='keywords_index', + fields=('name', 'name_fi'), + condition=Q(n_events__gt=0), + ) + ] class KeywordSet(BaseModel, ImageMixin): @@ -639,6 +648,14 @@ class SuperEventType: keywords = models.ManyToManyField(Keyword, related_name='events') audience = models.ManyToManyField(Keyword, related_name='audience_events', blank=True) + # this field is redundant, but allows to avoid expensive joins when searching for local events + local = models.BooleanField(default=False, db_index=True) + + # these fields are populated and kept up to date by the db. See migration 0080 + search_vector_fi = SearchVectorField(null=True) + search_vector_en = SearchVectorField(null=True) + search_vector_sv = SearchVectorField(null=True) + class Meta: verbose_name = _('event') verbose_name_plural = _('events') @@ -688,6 +705,9 @@ def save(self, *args, **kwargs): str(self.audience.filter(deprecated=True).values('id')) + ". 
Please use up-to-date keywords.")}) + # if self.location__divisions__ocd_id__endswith == MUNIGEO_MUNI: + # self.local = True + super(Event, self).save(*args, **kwargs) # needed to cache location event numbers diff --git a/events/tests/conftest.py b/events/tests/conftest.py index 920469b82..a73bd5033 100644 --- a/events/tests/conftest.py +++ b/events/tests/conftest.py @@ -1,31 +1,28 @@ # -*- coding: utf-8 -*- -from datetime import timedelta, datetime +from datetime import datetime, timedelta +# 3rd party +import pytest +from django.conf import settings +from django.contrib.auth import get_user_model +from django.contrib.gis.geos import MultiPolygon, Point, Polygon # django from django.core.management import call_command -from django.contrib.auth import get_user_model from django.utils import timezone -from .utils import versioned_reverse as reverse -from .test_event_get import get_list -from django.contrib.gis.geos import Point, Polygon, MultiPolygon -from munigeo.models import (AdministrativeDivision, AdministrativeDivisionType, AdministrativeDivisionGeometry, - Municipality) - -# 3rd party -import pytest -from rest_framework.test import APIClient from django_orghierarchy.models import Organization +from munigeo.models import (AdministrativeDivision, + AdministrativeDivisionGeometry, + AdministrativeDivisionType, Municipality) +from rest_framework.test import APIClient +from events.api import KeywordSerializer, LanguageSerializer, PlaceSerializer # events -from events.models import ( - DataSource, Place, Language, Keyword, KeywordLabel, Event, - Offer, KeywordSet) -from events.api import ( - KeywordSerializer, PlaceSerializer, LanguageSerializer -) -from django.conf import settings +from events.models import (DataSource, Event, Keyword, KeywordLabel, + KeywordSet, Language, Offer, Place) from ..models import License, PublicationStatus +from .test_event_get import get_list +from .utils import versioned_reverse as reverse TEXT_FI = 'testaus' TEXT_SV = 'testning' @@ -543,11 +540,19 @@ def keyword_id(data_source, organization, kw_name, make_keyword_id): @pytest.mark.django_db @pytest.fixture def keyword_set(data_source, keyword, keyword2): - kw_set = KeywordSet.objects.create(data_source=data_source) + kw_set = KeywordSet.objects.create(data_source=data_source, name='name1', id='set:1') kw_set.keywords.set([keyword, keyword2]) return kw_set +@pytest.mark.django_db +@pytest.fixture +def keyword_set2(data_source, keyword3): + kw_set = KeywordSet.objects.create(data_source=data_source, name='name2', id='set:2') + kw_set.keywords.set([keyword3]) + return kw_set + + @pytest.mark.django_db @pytest.fixture def languages(): diff --git a/events/tests/test_event_get.py b/events/tests/test_event_get.py index 6c6d8a9ef..b7ee82644 100644 --- a/events/tests/test_event_get.py +++ b/events/tests/test_event_get.py @@ -75,6 +75,10 @@ def assert_event_fields_exist(data, version='v1'): 'videos', 'replaced_by', 'deleted', + 'local', + 'search_vector_sv', + 'search_vector_fi', + 'search_vector_en', ) if version == 'v0.1': fields += ( @@ -979,3 +983,36 @@ def test_keyword_and_text(api_client, event, event2, keyword): event.save() response = get_list(api_client, query_string='combined_text=lapset,aikuiset') assert_events_in_response([event], response) + + +@pytest.mark.django_db +def test_keywordset_search(api_client, event, event2, event3, keyword, keyword2, keyword3, + keyword_set, keyword_set2): + event.keywords.add(keyword, keyword3) + event.save() + event2.keywords.add(keyword2, keyword3) + event2.save() + 
event3.keywords.add(keyword, keyword2) + event3.save() + response = get_list(api_client, query_string='keyword_set_AND=set:1,set:2') + assert_events_in_response([event, event2], response) + response = get_list(api_client, query_string='keyword_set_OR=set:1,set:2') + assert_events_in_response([event, event2, event3], response) + event3.keywords.remove(keyword, keyword2) + event3.save() + response = get_list(api_client, query_string='keyword_set_AND=set:1,set:2') + assert_events_in_response([event, event2], response) + + +@pytest.mark.django_db +def test_keyword_OR_set_search(api_client, event, event2, event3, keyword, keyword2, keyword3, + keyword_set, keyword_set2): + event.keywords.add(keyword, keyword3) + event.save() + event2.keywords.add(keyword2, keyword3) + event2.save() + event3.keywords.add(keyword, keyword2) + event3.save() + load = f'keyword_OR_set1={keyword.id},{keyword2.id}&keyword_OR_set2={keyword3.id}' + response = get_list(api_client, query_string=load) + assert_events_in_response([event, event2], response) diff --git a/helevents/templates/rest_framework/event_list.html b/helevents/templates/rest_framework/event_list.html index 0c1de0908..6a5bf857b 100644 --- a/helevents/templates/rest_framework/event_list.html +++ b/helevents/templates/rest_framework/event_list.html @@ -4,6 +4,35 @@

Filtering retrieved events

Query parameters can be used to filter the retrieved events by the following criteria.

+

Ongoing local events

+

Use to quickly access local (municipality-level) events that are upcoming or have not ended yet. Combines the search over a number of +description, name, and keyword fields. Locality is defined on the basis of the MUNIGEO_MUNI value, which is set in the settings file. In the Helsinki case, all events that take place within Helsinki would be retrieved. Comes in two flavors: AND and OR. Use event/?local_ongoing_AND=lapset,musiikki to search for events with both search terms in the description fields, and ?local_ongoing_OR to search for events with at least one term mentioned. If you need more complicated logic and want to search for a combination of terms, as in (singing OR vocal) AND (workshop OR training), use the ?local_ongoing_OR_setX parameter, where X is a number. A sketch of the underlying matching follows the examples below. +

Examples:

+
event/?local_ongoing_OR=lapsi,musiikki
+
+

See the result

+
event/?local_ongoing_OR_set1=lapsi,musiikki&local_ongoing_OR_set2=leiri,kurssi
+
+

See the result

+ +
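For illustration, here is a minimal sketch of how these parameters are matched against the cached events. It mirrors _terms_to_regex in events/api.py and relies on the fuzzy matching of the third-party regex module; the sample strings are made up:

```python
import regex


def terms_to_regex(terms, operator, fuzziness=3):
    # One fuzzy group per comma-separated term; {e<3} allows up to two
    # edits per term. 'AND' concatenates the groups, 'OR' alternates them.
    exprs = [r'(\b' + f'({term}){{e<{fuzziness}}})' for term in terms.split(',')]
    join = '' if operator == 'AND' else '|'
    return regex.compile(join.join(exprs), regex.IGNORECASE)


rc = terms_to_regex('lapsi,musiikki', 'OR')
print(bool(rc.search('Musikki-ilta lapsille')))  # True: "Musikki" is one edit away from "musiikki"
```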

Ongoing internet events

+

Use to quickly access internet-based events that are upcoming or have not ended yet. Usage is the same as for local ongoing events, with three variations: ?internet_ongoing_AND, ?internet_ongoing_OR, and ?internet_ongoing_OR_setX. Note that local_ongoing and internet_ongoing are mutually exclusive.

+

Example:

+
event/?internet_ongoing_AND=lapsi,musiikki
+
+

See the result

+

All ongoing events

+

All ongoing events, both internet-based and local, combined. Usage is the same as for local ongoing events: ?all_ongoing_AND and ?all_ongoing_OR. A sketch of the underlying cache merge follows the example below.

+

Example:

+
event/?all_ongoing_AND=lapsi,musiikki
+
+

See the result

+
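A minimal sketch of the cache merge behind the all_ongoing* parameters, assuming the ongoing_events cache has been populated by the populate_local_event_cache command (terms_to_regex is the sketch above):

```python
from django.core.cache import caches

cache = caches['ongoing_events']
# Merge both cached {event_id: searchable text} buckets, as the
# all_ongoing* filters do, then keep the ids whose text matches.
cached_ids = {k: v
              for bucket in cache.get_many(['internet_ids', 'local_ids']).values()
              for k, v in bucket.items()}
rc = terms_to_regex('lapsi,musiikki', 'OR')
matching_ids = {k for k, v in cached_ids.items() if rc.search(v)}
```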

Internet based

+

Filter for all the events that take place on the internet, both past and upcoming.

+

Example:

+
event/?internet_based=true
+
+

See the result

Event time

Use start and end to restrict the date range of returned events. Any events that intersect with the given date range will be returned.

@@ -77,6 +106,7 @@

District

event/?division=malmi
 

See the result

+

Event category

To restrict the retrieved events by category, use the query parameter keyword, separating values by commas if you wish to @@ -94,6 +124,15 @@

Event category

event/?keyword=yso:p4354
 

See the result

+ +

Keyword set search

+

Some services maintain curated keyword sets, which can also be used in search with the query parameters keyword_set_AND and keyword_set_OR. As keyword set names can repeat across services, ids should be supplied. Say we have one keyword set Music with id "myservice:1" that contains the keywords rock and jazz, and another keyword set Workshops with the keywords "workshop" and "seminar" and id "myservice:2". Then a request /event/?keyword_set_AND=myservice:1,myservice:2 would return the events matching the following expression: (rock OR jazz) AND (workshop OR seminar).
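A minimal sketch of how the AND semantics map onto the ORM in _filter_event_queryset: each requested set adds one chained filter (the AND), while keywords__in inside a set gives the OR:

```python
from events.models import Event, KeywordSet


def filter_by_keyword_sets_and(queryset, set_ids):
    # Chaining .filter() calls ANDs the sets together, while
    # keywords__in ORs the keywords within each set.
    for keyword_set in KeywordSet.objects.filter(id__in=set_ids):
        queryset = queryset.filter(keywords__in=keyword_set.keywords.all())
    return queryset


events = filter_by_keyword_sets_and(Event.objects.all(), ['myservice:1', 'myservice:2'])
```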

+

Event last modification time

To find events that have changed since you last polled Linkedevents API (to e.g. update your event cache), it is best to use the query parameter last_modified_since. This allows you to only return data @@ -125,13 +164,6 @@

Event text

event/?text=shostakovich
 

See the result

-

Combined search for local events

-

Use to quickly access local events that are upcoming or have not ended yet. Combines the search on a number of -description, name, and keyword fields

-

Example:

-
event/?combined_local_ongoing=rock
-
-

See the result

Event price

Events may or may not contain the offers field that lists event pricing. To return only free or non-free events, use the query parameter is_free. However, note that from some
diff --git a/linkedevents/settings.py b/linkedevents/settings.py
index 437c22b41..1d31bade8 100644
--- a/linkedevents/settings.py
+++ b/linkedevents/settings.py
@@ -492,7 +492,7 @@ def dummy_haystack_connection_for_lang(language_code):
         'LOCATION': '127.0.0.1:11211',
         'TIMEOUT': 300,
     },
-    'ongoing_local': {
+    'ongoing_events': {
         'BACKEND': 'django.core.cache.backends.memcached.MemcachedCache',
         'LOCATION': '127.0.0.1:11211',
         'TIMEOUT': None,
@@ -501,3 +501,6 @@
         }
     }
 }
+
+# this is relevant for the fulltext search as implemented in _filter_event_queryset()
+FULLTEXT_SEARCH_LANGUAGES = {'fi': 'finnish', 'sv': 'swedish', 'en': 'english'}
diff --git a/linkedevents/urls.py b/linkedevents/urls.py
index d9b157aff..df9920211 100644
--- a/linkedevents/urls.py
+++ b/linkedevents/urls.py
@@ -1,4 +1,3 @@
-import debug_toolbar
 import environ
 from django.conf.urls import include, url
 from django.contrib import admin
@@ -26,4 +25,5 @@ def get_redirect_url(self, *args, **kwargs):
 ]
 
 if env('DEBUG'):
+    import debug_toolbar
     urlpatterns.append(url(r'^__debug__/', include(debug_toolbar.urls)))
diff --git a/requirements.in b/requirements.in
index 4d3484c43..c75b483b5 100644
--- a/requirements.in
+++ b/requirements.in
@@ -41,6 +41,7 @@ python-Levenshtein
 python-memcached
 pytz
 rdflib
+regex
 requests-cache
 requests>=2.20.0
 sentry-sdk
diff --git a/requirements.txt b/requirements.txt
index e3f5b5ded..2a6f86dc1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -67,6 +67,7 @@ python3-openid==3.2.0  # via django-allauth
 pytz==2020.1  # via -r requirements.in, django, icalendar
 pyyaml==5.3.1  # via -r requirements.in, django-munigeo
 rdflib==5.0.0  # via -r requirements.in
+regex==2020.11.13  # via -r requirements.in
 requests-cache==0.5.2  # via -r requirements.in, django-munigeo
 requests-oauthlib==1.3.0  # via django-allauth
 requests==2.24.0  # via -r requirements.in, django-allauth, django-anymail, django-helusers, django-munigeo, django-orghierarchy, httmock, pyjwkest, requests-cache, requests-oauthlib

From ec3359b69fca933de2498b81fd3fdcc75daf9ce1 Mon Sep 17 00:00:00 2001
From: Daniel Kislyuk
Date: Wed, 21 Apr 2021 11:34:39 +0300
Subject: [PATCH 2/8] LINK-264 Change the logic of the KeywordMatcher

KeywordMatcher takes strings of words as input, mostly without a language
specification, and has to transform them into a list of Keywords. This commit
implements the following logic: first, an exactly matching KeywordLabel is
searched for, irrespective of the language. If no exact match is found,
Postgres full-text search is used to find a label matched by lexeme. The
results are ranked with TrigramSimilarity, as full-text SearchRank is not
suitable for ranking matched individual words. If no language is passed, we
cycle through all the options specified in FULLTEXT_SEARCH_LANGUAGES and
select the best match according to similarity. If no match is found, the
string is checked for the possibility that it could be split, and the search
for matches is repeated on the parts.
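As an illustration of the ranking described above, a minimal sketch of the lexeme match plus trigram ranking (mirroring the new full_text_matching; it assumes the language-specific search_vector_* columns added to KeywordLabel in migration 0082):

```python
from django.contrib.postgres.search import SearchQuery, TrigramSimilarity

from events.models import KeywordLabel


def best_label(text, language='fi', config='finnish'):
    # Match labels by lexeme via the language-specific tsvector column,
    # then rank the candidates by trigram similarity to the raw text.
    query = SearchQuery(text, config=config, search_type='plain')
    return (KeywordLabel.objects
            .filter(**{f'search_vector_{language}': query})
            .annotate(similarity=TrigramSimilarity('name', text))
            .order_by('-similarity')
            .first())
```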
--- events/api.py | 2 +- events/keywords.py | 160 +++++++----------- .../0082_keywordlabel_search_vectors.py | 46 +++++ events/models.py | 3 + 4 files changed, 113 insertions(+), 98 deletions(-) create mode 100644 events/migrations/0082_keywordlabel_search_vectors.py diff --git a/events/api.py b/events/api.py index f18881c35..ff2d0040b 100644 --- a/events/api.py +++ b/events/api.py @@ -1767,7 +1767,7 @@ def _filter_event_queryset(queryset, params, srs=None): language = params.get('language', 'fi') langs = settings.FULLTEXT_SEARCH_LANGUAGES if language not in langs.keys(): - raise ParseError(f"{language} not supported. Supported options are: {' '.join(settings.FULLTEXT_SEARCH_LANGUAGES)}") # noqa E501 + raise ParseError(f"{language} not supported. Supported options are: {' '.join(langs.values())}") query = SearchQuery(val, config=langs[language], search_type='plain') kwargs = {f'search_vector_{language}': query} diff --git a/events/keywords.py b/events/keywords.py index bc09fcde2..271e0bba4 100644 --- a/events/keywords.py +++ b/events/keywords.py @@ -1,112 +1,78 @@ -import re -from events.models import Keyword, KeywordLabel, DataSource -from difflib import get_close_matches +from django.conf import settings +from django.contrib.postgres.search import SearchQuery, TrigramSimilarity +from rest_framework.exceptions import ParseError -ENDING_PARENTHESIS_PATTERN = r' \([^)]+\)$' +from events.models import KeywordLabel class KeywordMatcher(object): def __init__(self): - label_to_keyword_ids = {} - self.name_to_keyword_ids = {} - for label_id, keyword_id in Keyword.alt_labels.through.objects.all().values_list( - 'keywordlabel_id', 'keyword_id'): - label_to_keyword_ids.setdefault(label_id, set()).add(keyword_id) - for label_id, name in KeywordLabel.objects.filter(language_id='fi').values_list( - 'id', 'name'): - self.name_to_keyword_ids[name.lower()] = label_to_keyword_ids.get(label_id, set()) - try: - yso_source = DataSource.objects.get(pk='yso') - self.skip = False - except DataSource.DoesNotExist: - print('No YSO keyword data source') - self.skip = True - return - for kid, preflabel in Keyword.objects.filter(data_source=yso_source).values_list( - 'id', 'name_fi'): - if preflabel is not None: - text = preflabel.lower() - self.name_to_keyword_ids.setdefault(text, set()).add(kid) - without_parenthesis = re.sub(ENDING_PARENTHESIS_PATTERN, '', text) - if without_parenthesis != text: - self.name_to_keyword_ids.setdefault(without_parenthesis, set()).add(kid) - self.labels = self.name_to_keyword_ids.keys() - print('Initialized', len(self.labels), 'keyword keys') + pass - def match(self, text): - if self.skip: - return None - wordsplit = re.compile(r'\s+') - # labels = KeywordLabel.objects - # match = labels.filter(name__iexact=text) + def full_text_matching(self, text, language=None): + used_langs = settings.FULLTEXT_SEARCH_LANGUAGES + if language: + if language not in used_langs.keys(): + raise ParseError(f"{language} not supported. 
Supported options are: {' '.join(used_langs.values())}") + languages = [language] + else: + languages = used_langs.keys() - text = text.lower() - if text == 'kokous': - text = 'kokoukset' - elif text == 'kuntoilu': - text = 'kuntoliikunta' - elif text == 'samba': - text = 'sambat' + contestants = {} + for language in languages: + query = SearchQuery(text, config=used_langs[language], search_type='plain') + kwargs = {f'search_vector_{language}': query} - labels = self.labels - matches = [l for l in labels if l.lower() == text] - if matches: - match_type = 'exact' - if not matches: - words = wordsplit.split(text) - if len(words) > 1: - for word in words: - matches.extend([l for l in labels if l.lower() == word]) - match_type = 'subword' # Later attempts will override, if this wasn't a match - if not matches: - matches = [l for l in labels if l.lower().startswith(text)] - match_type = 'prefix' - if not matches: - matches = [l for l in labels if l.lower() == text + 't'] - match_type = 'simple-plural' - if not matches: - matches = [l for l in labels if l.lower().startswith(text[0:-2])] - match_type = 'cut-two-letters' - if not matches: - if len(text) > 10: - matches = [l for l in labels if l.lower().startswith(text[0:-5])] - match_type = 'prefix' - if not matches: - for i in range(1, 10): - matches = [l for l in labels if l.lower() == text[i:]] - if matches: - match_type = 'suffix' - break + # find matches via search vector and choose the best one according to trgrm similarity + label = KeywordLabel.objects.filter(**kwargs).annotate(similarity=TrigramSimilarity('name', text) + ).order_by('-similarity' # noqa E124 + ).first() # noqa E124 + # storing the result in a dictionary of the following structure {similarity: label} + # in the edge case when similarity is the same for two different languages the label will be + # overwritten, which is not big deal as we don't have a way to select between the two anyway. + if label: + contestants[label.similarity] = label - if not matches: - print('no match', text) + # selecting the match with the highest similarity, if there is anything to select + if contestants.keys(): + return contestants[max(contestants.keys())] + else: return None - keyword_ids = set() - if match_type not in ['exact', 'subword']: - cmatch = get_close_matches( - text, [m.lower() for m in matches], n=1) - if len(cmatch) == 1: - keyword_ids = self.name_to_keyword_ids.get(cmatch[0]) + def label_match(self, text, language=None): + # Looking for an exact match regardless of the language + label = KeywordLabel.objects.filter(name__iexact=text) + if label: + return label - else: - for m in matches: - keyword_ids.update(self.name_to_keyword_ids[m]) + # If no exact matches found, let's use Postgres full-text search + # to find a label matched by lexeme and rank the results with + # TrigramSimilarity as ft SearchRank is not suitable for ranking matched + # individual words. If no language is passed we cycle through all + # the options as specified in FULLTEXT_SEARCH_LANGUAGES and select + # the best match according to similarity. 
+ label = self.full_text_matching(text, language) + if label: + return [label] - if len(keyword_ids) < 1: - print('no matches for', text) + # if no matches found let's check if we could split the string + texts = text.split() + if len(texts) > 1: + all_the_labels = [] + for word in texts: + label = self.label_match(word, language) + if label: + all_the_labels.extend(label) + return all_the_labels if all_the_labels else None + else: return None - objects = Keyword.objects.filter(id__in=keyword_ids, deprecated=False) - if len(keyword_ids) > 1: - try: - aggregate_keyword = objects.get(aggregate=True) - aggregate_name = re.sub(ENDING_PARENTHESIS_PATTERN, '', aggregate_keyword.name_fi) - result = [aggregate_keyword] - for o in objects.exclude(name_fi__istartswith=aggregate_name): - result.append(o) - return result - except Keyword.DoesNotExist: - pass - return objects - return objects + def match(self, text, language=None): + labels = self.label_match(text, language) + keywords = [] + if labels: + for label in labels: + keywords.extend(label.keywords.all()) + return keywords + else: + return None diff --git a/events/migrations/0082_keywordlabel_search_vectors.py b/events/migrations/0082_keywordlabel_search_vectors.py new file mode 100644 index 000000000..34f095ce9 --- /dev/null +++ b/events/migrations/0082_keywordlabel_search_vectors.py @@ -0,0 +1,46 @@ +# Generated by Django 2.2.13 on 2021-04-20 05:28 + +import django.contrib.postgres.search +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('events', '0081_event_local'), + ] + + operations = [ + migrations.AddField( + model_name='keywordlabel', + name='search_vector_en', + field=django.contrib.postgres.search.SearchVectorField(null=True), + ), + migrations.AddField( + model_name='keywordlabel', + name='search_vector_fi', + field=django.contrib.postgres.search.SearchVectorField(null=True), + ), + migrations.AddField( + model_name='keywordlabel', + name='search_vector_sv', + field=django.contrib.postgres.search.SearchVectorField(null=True), + ), + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunSQL(sql=["UPDATE events_keywordlabel SET search_vector_fi = to_tsvector('finnish', name) WHERE language_id='fi';" + "UPDATE events_keywordlabel SET search_vector_en = to_tsvector('english', name) WHERE language_id='en';" + "UPDATE events_keywordlabel SET search_vector_sv = to_tsvector('swedish', name) WHERE language_id='sv';" + "CREATE TRIGGER fi_trigger BEFORE INSERT OR UPDATE ON events_keywordlabel FOR EACH ROW WHEN (NEW.language_id='fi') EXECUTE PROCEDURE tsvector_update_trigger(search_vector_fi, 'pg_catalog.finnish', name);" + "CREATE TRIGGER en_trigger BEFORE INSERT OR UPDATE ON events_keywordlabel FOR EACH ROW WHEN (NEW.language_id='en') EXECUTE PROCEDURE tsvector_update_trigger(search_vector_en, 'pg_catalog.english', name);" + "CREATE TRIGGER sv_trigger BEFORE INSERT OR UPDATE ON events_keywordlabel FOR EACH ROW WHEN (NEW.language_id='sv') EXECUTE PROCEDURE tsvector_update_trigger(search_vector_sv, 'pg_catalog.swedish', name);" + ], + reverse_sql=["DROP TRIGGER fi_trigger ON events_keywordlabel;" + "DROP TRIGGER en_trigger ON events_keywordlabel;" + "DROP TRIGGER sv_trigger ON events_keywordlabel;" + ] + ) + ] + + ) + ] diff --git a/events/models.py b/events/models.py index ae62a5615..b696d510c 100644 --- a/events/models.py +++ b/events/models.py @@ -274,6 +274,9 @@ class Meta: class KeywordLabel(models.Model): name = models.CharField(verbose_name=_('Name'), 
max_length=255, db_index=True) language = models.ForeignKey(Language, on_delete=models.CASCADE, blank=False, null=False) + search_vector_fi = SearchVectorField(null=True) + search_vector_en = SearchVectorField(null=True) + search_vector_sv = SearchVectorField(null=True) def __str__(self): return self.name + ' (' + str(self.language) + ')' From 64e71aed457aeecfcfa49204b7bb9d9f38e30a8f Mon Sep 17 00:00:00 2001 From: Jari Turkia Date: Thu, 29 Apr 2021 14:53:24 +0300 Subject: [PATCH 3/8] =?UTF-8?q?Initial=20version=20of=20Vapaaehtoisty?= =?UTF-8?q?=C3=B6.fi=20API=20importer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 66 ++++ events/importer/helper/__init__.py | 0 events/importer/helper/importers/__init__.py | 0 .../importers/vapaaehtoistyofi/__init__.py | 2 + .../importers/vapaaehtoistyofi/reader.py | 116 +++++++ .../importers/vapaaehtoistyofi/record.py | 135 ++++++++ events/importer/vapaaehtoistyofi.py | 303 ++++++++++++++++++ linkedevents/settings.py | 5 + 8 files changed, 627 insertions(+) create mode 100644 events/importer/helper/__init__.py create mode 100644 events/importer/helper/importers/__init__.py create mode 100644 events/importer/helper/importers/vapaaehtoistyofi/__init__.py create mode 100644 events/importer/helper/importers/vapaaehtoistyofi/reader.py create mode 100644 events/importer/helper/importers/vapaaehtoistyofi/record.py create mode 100644 events/importer/vapaaehtoistyofi.py diff --git a/README.md b/README.md index ce46774c2..250ae34c4 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,7 @@ Install required Python packages into the virtualenv ```bash cd $INSTALL_BASE/linkedevents pip install -r requirements.txt +pip install -r requirements-dev.txt ``` Create the database, like so: (we have only tested on PostgreSQL) ```bash @@ -97,6 +98,8 @@ The last steps are needed if you wish to use location, address or event data fro The commands below are documented in more detail in [linked-events-importers.md](./linked-events-importers.md#linked-events-importers-and-commands). +**Note**: Running below commands to import all data will take several hours to complete. + ```bash cd $INSTALL_BASE/linkedevents # Import general Finnish ontology (used by Helsinki UI and Helsinki events) @@ -130,6 +133,69 @@ For further erudition, take a look at the DRF documentation on [customizing the After this, everything but search endpoint (/search) is working. 
See [search](#search) +## `local_settings.py` +Create this file from `local_settings.py.template` and edit it to contain +settings specific for your own workstation + +### Example +Assumptions: +* Windows 10 workstation +* Django running on Windows for development +* Database running PostGIS on a Docker container accessible at _localhost:5555_ +* Installed GeoDjango into `C:\OSGeo4W64\` + +File contents are: +```python +import os + +DEBUG = True + +DATABASES = { + 'default': { + 'ENGINE': 'django.contrib.gis.db.backends.postgis', + 'NAME': 'linkedevents', + 'USER': 'linkedevents', + 'PASSWORD': 'linkedevents', + 'HOST': 'localhost', + 'PORT': 5555 + } +} + +CUSTOM_MAPPINGS = { + 'autosuggest': { + 'search_analyzer': 'standard', + 'index_analyzer': 'edgengram_analyzer', + 'analyzer': None + }, + 'text': { + 'analyzer': 'default' + } +} + +GDAL_LIBRARY_PATH = r'C:\OSGeo4W64\bin\gdal204' +GEOS_LIBRARY_PATH = r'C:\OSGeo4W64\bin\geos_c' +GDAL_DATA = r'C:\OSGeo4W64\share\epsg_csv' +os.environ["GDAL_DATA"] = GDAL_DATA +``` + +### GeoDjango + +#### Linux +tbd + +#### macOS +tbd + +#### Windows +1. Go to https://trac.osgeo.org/osgeo4w/ +1. Download appropriate installation binaries +1. Install +1. Edit `local_settings.py` to contain `GDAL_LIBRARY_PATH = r'C:\OSGeo4W64\bin\gdal204'` +1. For any WFS-access with HTTPS to work (example: Osoite-import), need to have environment variable + `CURL_CA_BUNDLE=C:\OSGeo4W64\bin\curl-ca-bundle.crt` +1. Done! + + Production notes ---------------- diff --git a/events/importer/helper/__init__.py b/events/importer/helper/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/events/importer/helper/importers/__init__.py b/events/importer/helper/importers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/events/importer/helper/importers/vapaaehtoistyofi/__init__.py b/events/importer/helper/importers/vapaaehtoistyofi/__init__.py new file mode 100644 index 000000000..8f5bbc9f7 --- /dev/null +++ b/events/importer/helper/importers/vapaaehtoistyofi/__init__.py @@ -0,0 +1,2 @@ +from .reader import Reader +from .record import Record diff --git a/events/importer/helper/importers/vapaaehtoistyofi/reader.py b/events/importer/helper/importers/vapaaehtoistyofi/reader.py new file mode 100644 index 000000000..322673a43 --- /dev/null +++ b/events/importer/helper/importers/vapaaehtoistyofi/reader.py @@ -0,0 +1,116 @@ +import requests +import logging +from .record import Record + +log = logging.getLogger(__name__) + + +class Reader: + endpoint_url = 'https://apiv2.vapaaehtoistyo.fi' + rest_user_agent = 'HelsinkiVETImporter/0.1' + timeout = 5.0 + cached_entries = True + + def __init__(self, api_key): + self.api_key = api_key + + if self.cached_entries: + cnt_entries, data = self.load_entries() + self.entries = {} + for record in data: + id = record.id + self.entries[id] = record + + def _setup_client(self): + headers = { + 'Accept': 'application/json', + 'User-Agent': self.rest_user_agent, + "Authorization": "Bearer %s" % self.api_key + } + + s = requests.Session() + s.headers.update(headers) + + return s + + def load_entry(self, id): + if not self.cached_entries: + return self._load_entry_api(id) + + if not id in self.entries: + return False + + return self.entries[id] + + def _load_entry_api(self, id): + http_client = self._setup_client() + url = "%s/task/%s" % (self.endpoint_url, id) + response = http_client.get(url, timeout=self.timeout) + if response.status_code != 200: + raise RuntimeError("Failed to request data from Vapaaehtoistyö.fi 
API! HTTP/%d" % + response.status_code) + + data = response.json() + if 'status' not in data or data['status'] != "ok": + raise RuntimeError("Vapaaehtoistyö.fi response isn't ok!") + if 'data' not in data: + raise RuntimeError("Vapaaehtoistyö.fi response doesn't contain 'data'!") + data_obj = Record(data) + + return data_obj + + def load_entries(self): + http_client = self._setup_client() + page = 1 + total_records = None + batch_size = 1 + ret = [] + while batch_size: + url = "%s/collection/task?page=%d" % (self.endpoint_url, page) + response = http_client.get(url, timeout=self.timeout) + + if response.status_code != 200: + raise RuntimeError("Failed to request data from Vapaaehtoistyö.fi API! HTTP/%d" % + response.status_code) + data = response.json() + if 'status' not in data or data['status'] != "ok": + raise RuntimeError("Vapaaehtoistyö.fi response isn't ok!") + if 'data' not in data: + raise RuntimeError("Vapaaehtoistyö.fi response doesn't contain 'data'!") + if not total_records: + total_records = int(data['data']['totalRecords']) + + batch_size = len(data['data']['records']) + for data in data['data']['records']: + data_obj = Record(data) + ret.append(data_obj) + page += 1 + + return total_records, ret + + def load_photo(self, id): + http_client = self._setup_client() + url = "%s/collection/task-photo/%s" % (self.endpoint_url, id) + response = http_client.get(url, timeout=self.timeout) + if response.status_code != 200: + if response.status_code == 404: + # No photo for this event + return None, None + + raise RuntimeError("Failed to request data from Vapaaehtoistyö.fi API! HTTP/%d" % + response.status_code) + + data = response.json() + if 'status' not in data or data['status'] != "ok": + raise RuntimeError("Vapaaehtoistyö.fi response isn't ok!") + if 'data' not in data: + #log.error("%s: %s" % (id, data)) + #raise RuntimeError("Vapaaehtoistyö.fi response doesn't contain 'data'!" + log.warning("Requested photo for %s, but didn't receive any data!" 
% id)
+            return None, None
+        if data['data']['photo']['type'] != "Buffer":
+            raise RuntimeError("Vapaaehtoistyö.fi response isn't ok!")
+        mime_type = data['data']['mimetype']
+        photo = bytearray(data['data']['photo']['data'])
+
+        return mime_type, photo
diff --git a/events/importer/helper/importers/vapaaehtoistyofi/record.py b/events/importer/helper/importers/vapaaehtoistyofi/record.py
new file mode 100644
index 000000000..4c9c45834
--- /dev/null
+++ b/events/importer/helper/importers/vapaaehtoistyofi/record.py
@@ -0,0 +1,135 @@
+import datetime
+
+
+class Record:
+    STATUS_PUBLISHED = 1
+    STATUS_DRAFT = 0
+    STATUS = [STATUS_PUBLISHED, STATUS_DRAFT]
+
+    LOCALE_FI = 'fi'
+    LOCALE_SE = 'se'
+    LOCALE_EN = 'en'
+    LOCALE = [LOCALE_FI, LOCALE_SE, LOCALE_EN]
+
+    TASK_URL_FORMAT = 'https://vapaaehtoistyo.fi/%s/task/%s'
+
+    def __init__(self, json_dict):
+        if json_dict:
+            self.id = json_dict['id']
+            self.organization_id = json_dict['organization']
+            self.organization_name = json_dict['organizationName']
+            self.title = json_dict['title']
+            self.address = json_dict['address']
+            self.address_coordinates = {
+                'lat': json_dict['addressCoordinates']['lat'],
+                'lon': json_dict['addressCoordinates']['lng']
+            }
+            self.tags = []
+            for theme in json_dict['themes']:
+                theme_id = theme["id"]
+                theme_name = theme["name"]
+                self.tags.append({theme_id: theme_name})
+
+            self.timestamp_start = datetime.datetime.utcfromtimestamp(json_dict['timeStampStartTask'])
+            self.timestamp_end = datetime.datetime.utcfromtimestamp(json_dict['timeStampEndTask'])
+            if json_dict['noActualTime']:
+                self.no_time = True
+            else:
+                self.no_time = False
+            self.description = json_dict['description']
+            self.contact_details = json_dict['contactDetails']
+            self.themes = json_dict['themes']
+            self.timestamp_publish = datetime.datetime.utcfromtimestamp(json_dict['timeStampPublicationDate'])
+            # self.publicationTime = json_dict['id']
+            if json_dict['status'] in self.STATUS:
+                self.status = json_dict['status']
+            else:
+                raise RuntimeError("Unknown status %d!" % json_dict['status'])
+            self.creator_id = json_dict['creator']
+            if json_dict['timeStampAdded']:
+                self.timestamp_inserted = datetime.datetime.utcfromtimestamp(json_dict['timeStampAdded'])
+            else:
+                self.timestamp_inserted = None
+            if json_dict['timeStampLastUpdated']:
+                self.timestamp_updated = datetime.datetime.utcfromtimestamp(json_dict['timeStampLastUpdated'])
+            else:
+                self.timestamp_updated = None
+        else:
+            self.id = None
+            self.organization_id = None
+            self.organization_name = None
+            self.title = None
+            self.address = None
+            self.address_coordinates = None
+            self.tags = []
+
+            self.no_time = None
+            self.timestamp_start = None
+            self.timestamp_end = None
+
+            self.description = None
+            self.contact_details = None
+            self.themes = None
+            self.timestamp_publish = None
+
+            self.status = None
+            self.creator_id = None
+            self.timestamp_inserted = None
+            self.timestamp_updated = None
+
+    def get_url_locale(self, locale):
+        if locale not in self.LOCALE:
+            raise ValueError("Unknown locale '%s'!"
% locale) + return self.TASK_URL_FORMAT % (locale, self.id) + + def __copy__(self) -> object: + newrecord = Record(None) + newrecord.id = self.id + newrecord.organization_id = self.organization_id + newrecord.organization_name = self.organization_name + newrecord.title = self.title + newrecord.address = self.address + newrecord.address_coordinates = self.address_coordinates + newrecord.tags = self.tags + + newrecord.no_time = self.no_time + newrecord.timestamp_start = self.timestamp_start + newrecord.timestamp_end = self.timestamp_end + + newrecord.description = self.description + newrecord.contact_details = self.contact_details + newrecord.themes = self.themes + newrecord.timestamp_publish = self.timestamp_publish + + newrecord.status = self.status + newrecord.creator_id = self.creator_id + newrecord.timestamp_inserted = self.timestamp_inserted + newrecord.timestamp_updated = self.timestamp_updated + + return newrecord + + def to_dict(self): + newrecord = { + "id": self.id, + "organization_id": self.organization_id, + "organization_name": self.organization_name, + "title": self.title, + "address": self.address, + "address_coordinates": self.address_coordinates, + + "no_time": self.no_time, + "timestamp_start": self.timestamp_start, + "timestamp_end": self.timestamp_end, + + "description": self.description, + "contact_details": self.contact_details, + "themes": self.themes, + "timestamp_publish": self.timestamp_publish, + + "status": self.status, + "creator_id": self.creator_id, + "timestamp_inserted": self.timestamp_inserted, + "timestamp_updated": self.timestamp_updated + } + + return newrecord diff --git a/events/importer/vapaaehtoistyofi.py b/events/importer/vapaaehtoistyofi.py new file mode 100644 index 000000000..ffca42ffe --- /dev/null +++ b/events/importer/vapaaehtoistyofi.py @@ -0,0 +1,303 @@ +# -*- coding: utf-8 -*- +import logging + +import pytz +from datetime import datetime, timedelta +from django import db +from django.conf import settings +from django.contrib.gis.geos import Point, GEOSGeometry +from django.contrib.gis.db.models.functions import Distance +from django.core.management import call_command +from django_orghierarchy.models import Organization +from django.utils import timezone as django_timezone +from pytz import timezone +import bleach +import base64 + +from events.importer.util import replace_location +from events.models import ( + DataSource, + Event, + Keyword, + Place, + License +) +from .sync import ModelSyncher +from .base import Importer, recur_dict, register_importer +from .util import clean_text +from events.importer.helper.importers import vapaaehtoistyofi + +# Per module logger +logger = logging.getLogger(__name__) + + +@register_importer +class VapaaehtoistyofiImporter(Importer): + importer_id = 'vapaaehtoistyofi' + name = 'vapaaehtoistyö.fi' + supported_languages = ['fi', 'sv', 'en'] + ok_tags = ['u', 'b', 'h2', 'h3', 'em', 'ul', 'li', 'strong', 'br', 'p', 'a'] + DEFAULT_DURATION_DAYS = 90 + LOCAL_TZ = timezone('Europe/Helsinki') + UTC_TZ = timezone('UTC') + + VET_KEYWORD_ID = "p3050" + KEYWORDS = { + "": VET_KEYWORD_ID, # https://finto.fi/yso/fi/page/p3050 + "67b79307eeb3170e4a73de97d7db25c1019969c0": "p4785", # Järjestötoiminta + "ee1522536826c4245203c832893bb710ecf26967": ["p4354", "p11617", "p4363"], # Lapset, nuoret ja perheet + "ba1a23bf86b46b59c31eda2a64871b516d31c6d7": "p26028", # Ystävä- ja kaveritoiminta + "21a2dd8eb252b77696873c225dc404e92d49a88b": "p2573", # Also for English speakers + "8b51c2d692707464b31d04954b843cbb808ea356": "p2433", 
# Ikäihmiset + "9d3f2f277cce1c52334409e42c957eabfce33095": ["p965", "p916", "p2771"], # Urheilu, liikunta ja ulkoilu + "33a38af4e473ea1de6cbfd227e512570d88d2425": "p4400", # Sopii alle 18-vuotiaille + "5eb6379a96c3b01b4126cfc9fc862a7d50b55aa9": "p22112", # Maahanmuuttajat + "655171ca3998761dac17d195b24b67890f3f58b3": "p10801", # Etätyö puhelimitse tai verkossa + "1cd7fe40093fe69631be00e07efac3cb1109bb8e": "p229", # Seurakuntatoiminta + "73a7a8af91c9375921ff8b6f96e2ccb3de620869": ["p8660", "p11", "p2023"], # Ympäristö, luonnonsuojelu ja eläimet + "83ac738e5bdaae0de9ce6a03375e78c993c87595": ["p2108", "p6904"], # Tapahtumat ja talkoot + "7aad71eb0bc6ad2a4a1b5bdfe5c9e99a5046892f": ["p7179", "p17354"], # Vammaiset ja muut erityisryhmät + "c359cf2be5277f1cab72c9f487bd11df5a6f7758": ["p6206", "p7913"], # Päihde- ja mielenterveystyö + "f8a74c0d3582666e58423cb34746189a5b3b4043": [ + "p10190", "p15322", "p20819" + ], # Päivystys, ensiapu, pelastus ja kriisityö + "fa68d1dded69d8cb764c5d3a18449ae90cf10210": ["p1808", "p2851", "p4923"], # Musiikki, taide ja käsityöt + "f1d1b5fb6c50e5fadee65b85cae945ea891c9e1d": "p38829", # Korona + } + + def setup(self): + ds_args = dict(id=self.importer_id) + defaults = dict(name='Vapaaehtoistyö.fi') + self.data_source, _ = DataSource.objects.get_or_create(defaults=defaults, **ds_args) + + org_args = dict(origin_id='u021600', data_source=self.data_source) + defaults = dict(name='Vapaaehtoistyö.fi') + self.organization, _ = Organization.objects.get_or_create(defaults=defaults, **org_args) + + self.vetf_source = vapaaehtoistyofi.Reader(settings.VAPAAEHTOISTYOFI_API_KEY) + + try: + self.event_only_license = License.objects.get(id='event_only') + except License.DoesNotExist: + self.event_only_license = None + + def pk_get(self, resource_name, res_id=None): + logger.debug("pk_get(%s, %s)" % (resource_name, res_id)) + record = self.vetf_source.load_entry(res_id) + + return record + + def delete_and_replace(self, obj): + obj.deleted = True + obj.save(update_fields=['deleted']) + # we won't stand idly by and watch Vapaaehtoistyö.fi delete needed units willy-nilly without raising a ruckus! + if obj.events.count() > 0: + # try to replace by Vapaaehtoistyö.fi and, failing that, matko + replaced = replace_location(replace=obj, by_source=self.importer_id) + if not replaced: + # matko location may indeed be deleted by an earlier iteration + replaced = replace_location(replace=obj, by_source='matko', include_deleted=True) + if not replaced: + # matko location may never have been imported in the first place, do it now! + call_command('event_import', 'matko', places=True, single=obj.name) + replaced = replace_location(replace=obj, by_source='matko') + if not replaced: + logger.warning("Vapaaehtoistyö.fi deleted location %s (%s) with events." + "No unambiguous replacement was found. " + "Please look for a replacement location and save it in the replaced_by field. " + "Until then, events will stay mapped to the deleted location." 
+                               (obj.id, str(obj)))
+        return True
+
+    def mark_deleted(self, obj):
+        if obj.deleted:
+            return False
+        return self.delete_and_replace(obj)
+
+    def check_deleted(self, obj):
+        return obj.deleted
+
+    def _import_event(self, event_obj):
+        event = event_obj.to_dict()
+        logger.debug("Task id %s" % event_obj.id)
+        event['id'] = '%s:%s' % (self.data_source.id, event_obj.id)
+        event['origin_id'] = event_obj.id
+        event['data_source'] = self.data_source
+        event['publisher'] = self.organization
+        event['headline'] = {}
+        event['description'] = {}
+
+        title = bleach.clean(event_obj.title, tags=[], strip=True)
+        # long description is html formatted, so we don't want plain text whitespaces
+        title = clean_text(title, True)
+        Importer._set_multiscript_field(title, event, event_obj.LOCALE, 'headline')
+
+        desc = bleach.clean(event_obj.description, tags=self.ok_tags, strip=True)
+        # long description is html formatted, so we don't want plain text whitespaces
+        desc = clean_text(desc, True)
+        Importer._set_multiscript_field(desc, event, event_obj.LOCALE, 'description')
+
+        now = datetime.now(pytz.UTC)
+        # Import only at most one month old events
+        cut_off_date = now - timedelta(days=31)
+        cut_off_date.replace(tzinfo=pytz.UTC)
+        end_date = event_obj.timestamp_end.replace(tzinfo=pytz.UTC)
+        if end_date < cut_off_date:
+            logger.debug("Skipping task %s. Has ended %s" % end_date)
+            return None
+
+        event['start_time'] = django_timezone.make_aware(event_obj.timestamp_start, pytz.UTC)
+        event['end_time'] = end_date
+
+        # Note: In Vapaaehtoistyö.fi, tasks do not contain language information
+        lang = 'fi'
+        event['info_url'] = {}
+        event['external_links'] = {}
+        event['info_url'][lang] = event_obj.get_url_locale(lang)
+        event['external_links'][lang] = {}
+
+        event['images'] = self._import_photo(event_obj)
+        event['keywords'] = self._import_keywords(event_obj)
+        event['location'] = self._import_location(event_obj)
+
+        if not event['location']:
+            # Skip events not located in Greater Helsinki area
+            return None
+
+        return event
+
+    def _import_photo(self, event_obj):
+        # Note: Photo is returned as data, not as an URL.
+        # Note 2: There is nowhere we can store the data!
+        mime_type, photo_bytes = self.vetf_source.load_photo(event_obj.id)
+        if False and photo_bytes:
+            image_url = "data:%s;base64,%s" % (mime_type, base64.b64encode(photo_bytes))
+            return [{
+                'url': image_url,
+                'license': self.event_only_license,
+            }]
+
+        return []
+
+    def _import_keywords(self, event_obj):
+        event_keywords = []
+
+        try:
+            kw = Keyword.objects.get(id="yso:%s" % self.VET_KEYWORD_ID)
+        except Keyword.DoesNotExist:
+            kw = None
+        if not kw:
+            raise RuntimeError("Fatal: Cannot import Vapaaehtoistyö.fi! Missing YSO:%s keyword."
+                               % self.VET_KEYWORD_ID)
+
+        event_keywords.append(kw)
+        for tag_dict in event_obj.tags:
+            tag_id = list(tag_dict.keys())[0]
+            if tag_id in self.KEYWORDS:
+                keyword_value = self.KEYWORDS[tag_id]
+                if isinstance(keyword_value, str):
+                    keyword_value = [keyword_value]
+                for keyword in keyword_value:
+                    yso_id = "yso:%s" % keyword
+                    # logger.debug("Keyword query for: %s" % yso_id)
+                    try:
+                        kw = Keyword.objects.get(id=yso_id)
+                    except Keyword.DoesNotExist:
+                        logger.warning("Task %s has keyword %s, which maps into a non-existent %s" % (
+                            event_obj.id, tag_id, yso_id))
+                        kw = None
+                    if kw:
+                        event_keywords.append(kw)
+        logger.debug("Task %s: Got keywords: %s" % (tag_id, ', '.join([o.id for o in event_keywords])))
+
+        return event_keywords
+
+    def _import_location(self, event_obj):
+        # DEBUG: Logging of all queries
+        # logging.getLogger('django.db.backends').setLevel(logging.DEBUG)
+        # Note: Vapaaehtoistyö.fi will return "standard" WGS 84 latitude/longitude.
+        # Note 2: WGS 84 == EPSG:4326
+        # Note 3: In events_place table data is stored as EPSG:3067 (aka. ETRS89 / TM35FIN(E,N))
+        #         See: https://epsg.io/3067
+        # Note 4: PostGIS will do automatic translation from WGS 84 into EPSG:3067.
+        #         For manual EPSG translations, see: https://epsg.io/transform
+        ref_location = Point(event_obj.address_coordinates['lon'],
+                             event_obj.address_coordinates['lat'],
+                             srid=4326)
+        # Query for anything within 100 meters
+        places = Place.objects. \
+            filter(position__dwithin=(ref_location, 100.0)). \
+            filter(data_source_id='osoite'). \
+            annotate(distance=Distance("position", ref_location)). \
+            order_by("distance")[:3]
+        if not places:
+            logger.warning("Failed to find any locations for task id %s!" % event_obj.id)
+            return False
+
+        logger.debug("Got %d places, picking %s" % (len(places), places[0].id))
+        # for obj in places:
+        #     logger.debug("%s: %s, %f" % (obj.id, obj.name, obj.distance))
+
+        return {'id': places[0].id}
+
+    def import_events(self):
+        # DEBUG: Create keywords into empty database
+        if False:
+            self._debug_create_keywords()
+
+        logger.info("Importing Vapaaehtoistyö.fi events")
+        cnt_entries, event_list = self.vetf_source.load_entries()
+
+        qs = Event.objects.filter(end_time__gte=datetime.now(),
+                                  data_source=self.importer_id, deleted=False)
+
+        self.syncher = ModelSyncher(qs, lambda obj: obj.origin_id, delete_func=VapaaehtoistyofiImporter._mark_deleted)
+
+        for event_obj in event_list:
+            event = self._import_event(event_obj)
+            if event:
+                obj = self.save_event(event)
+                self.syncher.mark(obj)
+
+        self.syncher.finish(force=self.options['force'])
+        logger.info("%d events processed" % len(event_list))
+
+    @staticmethod
+    def _mark_deleted(obj):
+        if obj.deleted:
+            return False
+        obj.deleted = True
+        obj.save(update_fields=['deleted'])
+
+        return True
+
+    @db.transaction.atomic()
+    def _debug_create_keywords(self):
+        logger.info('confirming keywords...')
+
+        ds_args = {"id": 'yso'}
+        defaults = {"name": 'yso'}
+        self.data_source, created = DataSource.objects.get_or_create(defaults=defaults, **ds_args)
+        if created:
+            logger.info('created datasource for YSO')
+        else:
+            logger.info('datasource for YSO already exists')
+
+        for new_keyword_ext_id in self.KEYWORDS:
+            keyword_value = self.KEYWORDS[new_keyword_ext_id]
+            if isinstance(keyword_value, str):
+                keyword_value = [keyword_value]
+            for keyword in keyword_value:
+                yso_id = "yso:%s" % keyword
+                keyword_set, created = Keyword.objects.update_or_create(
+                    id=yso_id,
+                    defaults={
+                        'id': yso_id,
+                        'name_fi': '',
+                        'data_source_id': 'yso',
+                    }
+                )
+                if created:
+                    logger.info('created keyword %s' % (keyword))
+                else:
+                    logger.info('keyword %s already exists' % keyword)
+        logger.info('keywords confirmed')
diff --git a/linkedevents/settings.py b/linkedevents/settings.py
index 1d31bade8..5bd9b6e8e 100644
--- a/linkedevents/settings.py
+++ b/linkedevents/settings.py
@@ -504,3 +504,8 @@ def dummy_haystack_connection_for_lang(language_code):

 # this is relevant for the fulltext search as implemented in _filter_event_queryset()
 FULLTEXT_SEARCH_LANGUAGES = {'fi':'finnish', 'sv':'swedish', 'en':'english'}
+
+#
+# Vapaaehtoistyö.fi
+#
+VAPAAEHTOISTYOFI_API_KEY = env('VAPAAEHTOISTYOFI_API_KEY')
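Note on the coordinate handling in _import_location above: the dwithin lookup hands PostGIS an EPSG:4326 point even though events_place stores EPSG:3067 geometries, relying on the automatic transform described in the code comments. The same conversion can be checked by hand with GeoDjango; a minimal sketch, with illustrative coordinates that are not part of the patch:

    from django.contrib.gis.geos import Point

    # WGS 84 (EPSG:4326) lon/lat, as returned by Vapaaehtoistyo.fi
    ref = Point(24.9384, 60.1699, srid=4326)  # central Helsinki, illustrative

    ref_tm35 = ref.clone()
    ref_tm35.transform(3067)  # ETRS89 / TM35FIN, the SRID used by events_place
    print(ref_tm35.coords)    # planar coordinates in metres, so the 100 m radius is straightforward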
From 7ad229964b28d39bf6449281d178ded7e60e4d2f Mon Sep 17 00:00:00 2001
From: Jari Turkia
Date: Thu, 29 Apr 2021 15:01:21 +0300
Subject: [PATCH 4/8] =?UTF-8?q?Bugfix:=20If=20Vapaaehtoisty=C3=B6.fi=20API?=
 =?UTF-8?q?-key=20is=20missing,=20don't=20fail.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 events/importer/helper/importers/vapaaehtoistyofi/reader.py | 2 ++
 linkedevents/settings.py                                    | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/events/importer/helper/importers/vapaaehtoistyofi/reader.py b/events/importer/helper/importers/vapaaehtoistyofi/reader.py
index 322673a43..b3939ae3a 100644
--- a/events/importer/helper/importers/vapaaehtoistyofi/reader.py
+++ b/events/importer/helper/importers/vapaaehtoistyofi/reader.py
@@ -12,6 +12,8 @@ class Reader:
     cached_entries = True

     def __init__(self, api_key):
+        if not api_key:
+            raise ValueError("Vapaaehtoistyö.fi API key is required!")
         self.api_key = api_key

         if self.cached_entries:
diff --git a/linkedevents/settings.py b/linkedevents/settings.py
index 5bd9b6e8e..8dc1df957 100644
--- a/linkedevents/settings.py
+++ b/linkedevents/settings.py
@@ -508,4 +508,4 @@ def dummy_haystack_connection_for_lang(language_code):
 #
 # Vapaaehtoistyö.fi
 #
-VAPAAEHTOISTYOFI_API_KEY = env('VAPAAEHTOISTYOFI_API_KEY')
+VAPAAEHTOISTYOFI_API_KEY = env('VAPAAEHTOISTYOFI_API_KEY', default=None)

From 09461dbea1351cceaac7f453191606f01c6a705a Mon Sep 17 00:00:00 2001
From: Jari Turkia
Date: Thu, 29 Apr 2021 15:07:56 +0300
Subject: [PATCH 5/8] Added missing migration 0079 from
 feature/improve_search_performance to make migration succeed on Travis

---
 events/migrations/0079_add_search_vectors.py | 33 ++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 events/migrations/0079_add_search_vectors.py

diff --git a/events/migrations/0079_add_search_vectors.py b/events/migrations/0079_add_search_vectors.py
new file mode 100644
index 000000000..9faf8c8b8
--- /dev/null
+++ b/events/migrations/0079_add_search_vectors.py
@@ -0,0 +1,33 @@
+# Generated by Django 2.2.13 on 2021-04-12 06:38
+
+import django.contrib.postgres.search
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('events', '0078_add_data_source_past_events'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='event',
+            name='search_vector_en',
+            field=django.contrib.postgres.search.SearchVectorField(null=True),
+        ),
+        migrations.AddField(
+            model_name='event',
+            name='search_vector_fi',
+            field=django.contrib.postgres.search.SearchVectorField(null=True),
+        ),
+        migrations.AddField(
+            model_name='event',
+            name='search_vector_sv',
+            field=django.contrib.postgres.search.SearchVectorField(null=True),
+        ),
+        migrations.AddIndex(
+            model_name='keyword',
+            index=models.Index(condition=models.Q(n_events__gt=0), fields=['name', 'name_fi'], name='keywords_index'),
+        ),
+    ]
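Note: the three SearchVectorField columns added by migration 0079 start out as NULL, so the language-specific search matches nothing until they are populated. Besides doing it on the database side, a one-off backfill is also possible from the ORM; a minimal sketch, assuming Event carries translated name_fi and description_fi columns (field names and weights are assumptions, not part of this patch):

    from django.contrib.postgres.search import SearchVector

    from events.models import Event

    # Recompute the Finnish tsvector for all rows in one UPDATE.
    Event.objects.update(
        search_vector_fi=(
            SearchVector('name_fi', weight='A', config='finnish')
            + SearchVector('description_fi', weight='D', config='finnish')
        )
    )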
From 57a31f2cb7e0d457cb331a353a7b5393bf9a1bed Mon Sep 17 00:00:00 2001
From: Jari Turkia
Date: Thu, 29 Apr 2021 15:46:13 +0300
Subject: [PATCH 6/8] Minor fixes in code to make flake8 pass

---
 .../importers/vapaaehtoistyofi/__init__.py      |  2 ++
 .../helper/importers/vapaaehtoistyofi/reader.py |  6 +++---
 events/importer/vapaaehtoistyofi.py             | 16 +++++++++-------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/events/importer/helper/importers/vapaaehtoistyofi/__init__.py b/events/importer/helper/importers/vapaaehtoistyofi/__init__.py
index 8f5bbc9f7..e65efa1c9 100644
--- a/events/importer/helper/importers/vapaaehtoistyofi/__init__.py
+++ b/events/importer/helper/importers/vapaaehtoistyofi/__init__.py
@@ -1,2 +1,4 @@
 from .reader import Reader
 from .record import Record
+
+__all__ = ['Reader', 'Record']
diff --git a/events/importer/helper/importers/vapaaehtoistyofi/reader.py b/events/importer/helper/importers/vapaaehtoistyofi/reader.py
index b3939ae3a..2fc9a348a 100644
--- a/events/importer/helper/importers/vapaaehtoistyofi/reader.py
+++ b/events/importer/helper/importers/vapaaehtoistyofi/reader.py
@@ -39,7 +39,7 @@ def load_entry(self, id):
         if not self.cached_entries:
             return self._load_entry_api(id)

-        if not id in self.entries:
+        if id not in self.entries:
             return False

         return self.entries[id]
@@ -106,8 +106,8 @@ def load_photo(self, id):
         if 'status' not in data or data['status'] != "ok":
             raise RuntimeError("Vapaaehtoistyö.fi response isn't ok!")
         if 'data' not in data:
-            #log.error("%s: %s" % (id, data))
-            #raise RuntimeError("Vapaaehtoistyö.fi response doesn't contain 'data'!"
+            # log.error("%s: %s" % (id, data))
+            # raise RuntimeError("Vapaaehtoistyö.fi response doesn't contain 'data'!"
             log.warning("Requested photo for %s, but didn't receive any data!" % id)
             return None, None
         if data['data']['photo']['type'] != "Buffer":
diff --git a/events/importer/vapaaehtoistyofi.py b/events/importer/vapaaehtoistyofi.py
index ffca42ffe..d0d02190d 100644
--- a/events/importer/vapaaehtoistyofi.py
+++ b/events/importer/vapaaehtoistyofi.py
@@ -5,7 +5,7 @@
 from datetime import datetime, timedelta
 from django import db
 from django.conf import settings
-from django.contrib.gis.geos import Point, GEOSGeometry
+from django.contrib.gis.geos import Point
 from django.contrib.gis.db.models.functions import Distance
 from django.core.management import call_command
 from django_orghierarchy.models import Organization
@@ -23,7 +23,7 @@
     License
 )
 from .sync import ModelSyncher
-from .base import Importer, recur_dict, register_importer
+from .base import Importer, register_importer
 from .util import clean_text
 from events.importer.helper.importers import vapaaehtoistyofi
@@ -224,11 +224,13 @@ def _import_location(self, event_obj):
                              event_obj.address_coordinates['lat'],
                              srid=4326)
         # Query for anything within 100 meters
-        places = Place.objects. \
-            filter(position__dwithin=(ref_location, 100.0)). \
-            filter(data_source_id='osoite'). \
-            annotate(distance=Distance("position", ref_location)). \
-            order_by("distance")[:3]
+        # Note: flake8 doesn't allow this to be formatted in a readable way :-(
+        places = Place.objects.filter(
+            position__dwithin=(ref_location, 100.0)).filter(
+            data_source_id='osoite').annotate(
+            distance=Distance(
+                "position", ref_location)).order_by(
+            "distance")[:3]
         if not places:
             logger.warning("Failed to find any locations for task id %s!" % event_obj.id)
             return False
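Note on the ':-(' comment in the hunk above: a parenthesized call chain usually keeps both flake8 (default rules) and readability happy, with neither backslashes nor the staircase indentation; a sketch of the same query in that style, using Place, Distance and ref_location exactly as in _import_location:

    places = (
        Place.objects
        .filter(position__dwithin=(ref_location, 100.0))
        .filter(data_source_id='osoite')
        .annotate(distance=Distance("position", ref_location))
        .order_by("distance")[:3]
    )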
From 76a49a23702d62219ecec92c560b888d6dc66ffe Mon Sep 17 00:00:00 2001
From: Jari Turkia
Date: Fri, 7 May 2021 15:22:48 +0300
Subject: [PATCH 7/8] Simplified Record structure

---
 .../importers/vapaaehtoistyofi/record.py | 147 +++++-------------
 events/importer/vapaaehtoistyofi.py      |   6 +-
 2 files changed, 39 insertions(+), 114 deletions(-)

diff --git a/events/importer/helper/importers/vapaaehtoistyofi/record.py b/events/importer/helper/importers/vapaaehtoistyofi/record.py
index 4c9c45834..c76c386bb 100644
--- a/events/importer/helper/importers/vapaaehtoistyofi/record.py
+++ b/events/importer/helper/importers/vapaaehtoistyofi/record.py
@@ -14,122 +14,47 @@ class Record:
     TASK_URL_FORMAT = 'https://vapaaehtoistyo.fi/%s/task/%s'

     def __init__(self, json_dict):
-        if json_dict:
-            self.id = json_dict['id']
-            self.organization_id = json_dict['organization']
-            self.organization_name = json_dict['organizationName']
-            self.title = json_dict['title']
-            self.address = json_dict['address']
-            self.address_coordinates = {
-                'lat': json_dict['addressCoordinates']['lat'],
-                'lon': json_dict['addressCoordinates']['lng']
-            }
-            self.tags = []
-            for theme in json_dict['themes']:
-                theme_id = theme["id"]
-                theme_name = theme["name"]
-                self.tags.append({theme_id: theme_name})
-
-            self.timestamp_start = datetime.datetime.utcfromtimestamp(json_dict['timeStampStartTask'])
-            self.timestamp_end = datetime.datetime.utcfromtimestamp(json_dict['timeStampEndTask'])
-            if json_dict['noActualTime']:
-                self.no_time = True
-            else:
-                self.no_time = False
-            self.description = json_dict['description']
-            self.contact_details = json_dict['contactDetails']
-            self.themes = json_dict['id']
-            self.timestamp_publish = datetime.datetime.utcfromtimestamp(json_dict['timeStampPublicationDate'])
-            # self.publicationTime = json_dict['id']
-            if json_dict['status'] in self.STATUS:
-                self.status = json_dict['id']
-            else:
-                raise RuntimeError("Unknown status %d!" % json_dict['status'])
-            self.creator_id = json_dict['creator']
-            if json_dict['timeStampAdded']:
-                self.timestamp_inserted = datetime.datetime.utcfromtimestamp(json_dict['timeStampAdded'])
-            else:
-                self.timestamp_inserted = None
-            if json_dict['timeStampLastUpdated']:
-                self.timestamp_updated = datetime.datetime.utcfromtimestamp(json_dict['timeStampLastUpdated'])
-            else:
-                self.timestamp_updated = None
+        self.id = json_dict['id']
+        self.organization_id = json_dict['organization']
+        self.organization_name = json_dict['organizationName']
+        self.title = json_dict['title']
+        self.address = json_dict['address']
+        self.address_coordinates = {
+            'lat': json_dict['addressCoordinates']['lat'],
+            'lon': json_dict['addressCoordinates']['lng']
+        }
+        self.tags = []
+        for theme in json_dict['themes']:
+            theme_id = theme["id"]
+            theme_name = theme["name"]
+            self.tags.append({theme_id: theme_name})
+
+        self.timestamp_start = datetime.datetime.utcfromtimestamp(json_dict['timeStampStartTask'])
+        self.timestamp_end = datetime.datetime.utcfromtimestamp(json_dict['timeStampEndTask'])
+        if json_dict['noActualTime']:
+            self.no_time = True
+        else:
+            self.no_time = False
+        self.description = json_dict['description']
+        self.contact_details = json_dict['contactDetails']
+        self.themes = json_dict['id']
+        self.timestamp_publish = datetime.datetime.utcfromtimestamp(json_dict['timeStampPublicationDate'])
+        # self.publicationTime = json_dict['id']
+        if json_dict['status'] in self.STATUS:
+            self.status = json_dict['status']
+        else:
+            raise RuntimeError("Unknown status %d!" % json_dict['status'])
+        self.creator_id = json_dict['creator']
+        if json_dict['timeStampAdded']:
+            self.timestamp_inserted = datetime.datetime.utcfromtimestamp(json_dict['timeStampAdded'])
         else:
-            self.id = None
-            self.organization_id = None
-            self.organization_name = None
-            self.title = None
-            self.address = None
-            self.address_coordinates = None
-            self.tags = []
-
-            self.no_time = None
-            self.timestamp_start = None
-            self.timestamp_end = None
-
-            self.description = None
-            self.contact_details = None
-            self.themes = None
-            self.timestamp_publish = None
-
-            self.status = None
-            self.creator_id = None
             self.timestamp_inserted = None
+        if json_dict['timeStampLastUpdated']:
+            self.timestamp_updated = datetime.datetime.utcfromtimestamp(json_dict['timeStampLastUpdated'])
+        else:
             self.timestamp_updated = None

     def get_url_locale(self, locale):
         if locale not in self.LOCALE:
             raise ValueError("Unknown locale '%s'!" % locale)
         return self.TASK_URL_FORMAT % (locale, self.id)
-
-    def __copy__(self) -> object:
-        newrecord = Record(None)
-        newrecord.id = self.id
-        newrecord.organization_id = self.organization_id
-        newrecord.organization_name = self.organization_name
-        newrecord.title = self.title
-        newrecord.address = self.address
-        newrecord.address_coordinates = self.address_coordinates
-        newrecord.tags = self.tags
-
-        newrecord.no_time = self.no_time
-        newrecord.timestamp_start = self.timestamp_start
-        newrecord.timestamp_end = self.timestamp_end
-
-        newrecord.description = self.description
-        newrecord.contact_details = self.contact_details
-        newrecord.themes = self.themes
-        newrecord.timestamp_publish = self.timestamp_publish
-
-        newrecord.status = self.status
-        newrecord.creator_id = self.creator_id
-        newrecord.timestamp_inserted = self.timestamp_inserted
-        newrecord.timestamp_updated = self.timestamp_updated
-
-        return newrecord
-
-    def to_dict(self):
-        newrecord = {
-            "id": self.id,
-            "organization_id": self.organization_id,
-            "organization_name": self.organization_name,
-            "title": self.title,
-            "address": self.address,
-            "address_coordinates": self.address_coordinates,
-
-            "no_time": self.no_time,
-            "timestamp_start": self.timestamp_start,
-            "timestamp_end": self.timestamp_end,
-
-            "description": self.description,
-            "contact_details": self.contact_details,
-            "themes": self.themes,
-            "timestamp_publish": self.timestamp_publish,
-
-            "status": self.status,
-            "creator_id": self.creator_id,
-            "timestamp_inserted": self.timestamp_inserted,
-            "timestamp_updated": self.timestamp_updated
-        }
-
-        return newrecord
diff --git a/events/importer/vapaaehtoistyofi.py b/events/importer/vapaaehtoistyofi.py
index d0d02190d..98132032a 100644
--- a/events/importer/vapaaehtoistyofi.py
+++ b/events/importer/vapaaehtoistyofi.py
@@ -118,7 +118,7 @@ def check_deleted(self, obj):
         return obj.deleted

     def _import_event(self, event_obj):
-        event = event_obj.to_dict()
+        event = dict(event_obj.__dict__)
         logger.debug("Task id %s" % event_obj.id)
         event['id'] = '%s:%s' % (self.data_source.id, event_obj.id)
         event['origin_id'] = event_obj.id
@@ -143,7 +143,7 @@ def _import_event(self, event_obj):
         cut_off_date.replace(tzinfo=pytz.UTC)
         end_date = event_obj.timestamp_end.replace(tzinfo=pytz.UTC)
         if end_date < cut_off_date:
-            logger.debug("Skipping task %s. Has ended %s" % end_date)
+            logger.debug("Skipping task %s. Has ended %s" % (event_obj.id, end_date))
             return None

         event['start_time'] = django_timezone.make_aware(event_obj.timestamp_start, pytz.UTC)
@@ -207,7 +207,7 @@ def _import_keywords(self, event_obj):
                 kw = None
             if kw:
                 event_keywords.append(kw)
-        logger.debug("Task %s: Got keywords: %s" % (tag_id, ', '.join([o.id for o in event_keywords])))
+        logger.debug("Task %s: Got keywords: %s" % (event_obj.id, ', '.join([o.id for o in event_keywords])))

         return event_keywords
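Note: the switch from event_obj.to_dict() to dict(event_obj.__dict__) works because the simplified Record holds only plain data attributes, so a shallow snapshot of the instance dictionary is equivalent. A quick self-contained illustration of the idiom (the class here is illustrative, not part of the patch):

    class Task:
        """Illustrative stand-in for Record: data attributes only."""
        def __init__(self):
            self.id = 'abc123'
            self.tags = [{'t1': 'Korona'}]


    task = Task()
    event = dict(task.__dict__)        # shallow snapshot of instance attributes
    event['id'] = 'vet:abc123'         # rebinding a key leaves the instance alone
    assert task.id == 'abc123'
    event['tags'].append({'t2': 'x'})  # mutable values are still shared, though
    assert task.tags == [{'t1': 'Korona'}, {'t2': 'x'}]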
Has ended %s" % (event_obj.id, end_date)) return None event['start_time'] = django_timezone.make_aware(event_obj.timestamp_start, pytz.UTC) @@ -207,7 +207,7 @@ def _import_keywords(self, event_obj): kw = None if kw: event_keywords.append(kw) - logger.debug("Task %s: Got keywords: %s" % (tag_id, ', '.join([o.id for o in event_keywords]))) + logger.debug("Task %s: Got keywords: %s" % (event_obj.id, ', '.join([o.id for o in event_keywords]))) return event_keywords From 6bd3a2842dbdadbe7f6ad2773834d980b0402fbd Mon Sep 17 00:00:00 2001 From: Jari Turkia Date: Fri, 7 May 2021 15:42:56 +0300 Subject: [PATCH 8/8] Started using new endpoint for single tasks. Added proper session support into HTTP-client. --- .../importers/vapaaehtoistyofi/reader.py | 49 +++++++------------ 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/events/importer/helper/importers/vapaaehtoistyofi/reader.py b/events/importer/helper/importers/vapaaehtoistyofi/reader.py index 2fc9a348a..8e1dfe5b4 100644 --- a/events/importer/helper/importers/vapaaehtoistyofi/reader.py +++ b/events/importer/helper/importers/vapaaehtoistyofi/reader.py @@ -9,19 +9,12 @@ class Reader: endpoint_url = 'https://apiv2.vapaaehtoistyo.fi' rest_user_agent = 'HelsinkiVETImporter/0.1' timeout = 5.0 - cached_entries = True def __init__(self, api_key): if not api_key: raise ValueError("Really need API-key!") self.api_key = api_key - - if self.cached_entries: - cnt_entries, data = self.load_entries() - self.entries = {} - for record in data: - id = record.id - self.entries[id] = record + self.http_client = None def _setup_client(self): headers = { @@ -35,50 +28,48 @@ def _setup_client(self): return s - def load_entry(self, id): - if not self.cached_entries: - return self._load_entry_api(id) + def _get_client(self): + if self.http_client: + return self.http_client - if id not in self.entries: - return False + self.http_client = self._setup_client() - return self.entries[id] + return self.http_client - def _load_entry_api(self, id): - http_client = self._setup_client() - url = "%s/task/%s" % (self.endpoint_url, id) - response = http_client.get(url, timeout=self.timeout) + @staticmethod + def _load_status_check(data): + if 'status' not in data or data['status'] != "ok": + raise RuntimeError("Vapaaehtoistyö.fi response isn't ok!") + if 'data' not in data: + raise RuntimeError("Vapaaehtoistyö.fi response doesn't contain 'data'!") + + def load_entry(self, event_id): + url = "%s/task/%s" % (self.endpoint_url, event_id) + response = self._get_client().get(url, timeout=self.timeout) if response.status_code != 200: raise RuntimeError("Failed to request data from Vapaaehtoistyö.fi API! HTTP/%d" % response.status_code) data = response.json() - if 'status' not in data or data['status'] != "ok": - raise RuntimeError("Vapaaehtoistyö.fi response isn't ok!") - if 'data' not in data: - raise RuntimeError("Vapaaehtoistyö.fi response doesn't contain 'data'!") + self._load_status_check(data) data_obj = Record(data) return data_obj def load_entries(self): - http_client = self._setup_client() page = 1 total_records = None batch_size = 1 ret = [] while batch_size: url = "%s/collection/task?page=%d" % (self.endpoint_url, page) - response = http_client.get(url, timeout=self.timeout) + response = self._get_client().get(url, timeout=self.timeout) if response.status_code != 200: raise RuntimeError("Failed to request data from Vapaaehtoistyö.fi API! 
HTTP/%d" % response.status_code) data = response.json() - if 'status' not in data or data['status'] != "ok": - raise RuntimeError("Vapaaehtoistyö.fi response isn't ok!") - if 'data' not in data: - raise RuntimeError("Vapaaehtoistyö.fi response doesn't contain 'data'!") + self._load_status_check(data) if not total_records: total_records = int(data['data']['totalRecords']) @@ -106,8 +97,6 @@ def load_photo(self, id): if 'status' not in data or data['status'] != "ok": raise RuntimeError("Vapaaehtoistyö.fi response isn't ok!") if 'data' not in data: - # log.error("%s: %s" % (id, data)) - # raise RuntimeError("Vapaaehtoistyö.fi response doesn't contain 'data'!" log.warning("Requested photo for %s, but didn't receive any data!" % id) return None, None if data['data']['photo']['type'] != "Buffer":