26 changes: 26 additions & 0 deletions src/etls/dgt/README.md
@@ -0,0 +1,26 @@
# Main website

[Main website of the Dirección General de Tributos](https://www.hacienda.gob.es/es-ES/Areas%20Tematicas/Impuestos/Direccion%20General%20de%20Tributos/Paginas/Direccion%20general%20de%20tributos.aspx)


# Doctrine

[DGT doctrine](https://www.hacienda.gob.es/es-ES/Normativa%20y%20doctrina/Doctrina/Paginas/default.aspx)

# Search portal

[Search portal](https://petete.tributos.hacienda.gob.es/consultas/)

# Examples of scraped documents

[Doc1](https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=1&tab=1)
[Doc2](https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=64247&tab=2)
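
Each document is addressed by two query parameters: `doc` (the document id) and `tab` (the search tab it was published under, 1 and 2 in the examples above). For quick manual inspection, one document can be fetched with a request like the minimal sketch below; the headers and the `verify=False` flag mirror the ones used by the scraper in `utils.py` and `scrapper.py`:

```python
# Minimal sketch: fetch one scraped document by hand (Doc1 above).
import requests
from bs4 import BeautifulSoup

url = "https://petete.tributos.hacienda.gob.es/consultas/do/document"
params = {"doc": "1", "tab": "1"}  # values taken from the Doc1 example above
headers = {
    "Referer": "https://petete.tributos.hacienda.gob.es/consultas",
    "X-Requested-With": "XMLHttpRequest",
}

response = requests.get(url, params=params, headers=headers, verify=False)  # the scraper also skips SSL verification
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.get_text(separator="\n", strip=True)[:500])  # preview the ruling text
```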

# Dates to scrape

The first general ruling (consulta general) was published on 03/01/1997.
The first binding ruling (consulta vinculante) was published on 29/07/1997.
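
For an initial backfill starting from those dates, the `dates` command defined in `load.py` (added further down in this diff) can be driven directly from Python. A minimal sketch, assuming the package is importable and the vector store is configured; the end date is only an illustrative value:

```python
# Sketch: backfill DGT rulings starting at the first published general ruling.
# Dates use the YYYY/MM/DD format expected by the `dates` command in load.py.
from src.etls.dgt.load import dates

dates(date_start="1997/01/03", date_end="1997/12/31")  # end date chosen only for illustration
```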

# Database update frequency
??

Empty file added src/etls/dgt/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/etls/dgt/defs.py
@@ -0,0 +1 @@
COLLECTION_NAME = "dgt"
60 changes: 60 additions & 0 deletions src/etls/dgt/load.py
@@ -0,0 +1,60 @@
from datetime import date, datetime

import typer

from src.email.send_email import send_email
from src.etls.dgt.scrapper import DGTScrapper
from src.etls.dgt.defs import COLLECTION_NAME
from src.etls.common.etl import ETL
from src.initialize import initialize_app


app = typer.Typer()


@app.command()
def today(init_objects=None):
if init_objects is None:
init_objects = initialize_app()
etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME])
dgt_scrapper = DGTScrapper()
day = date.today()
docs = dgt_scrapper.download_day(day)
if docs:
etl_job.run(docs)

subject = "[DGT] Daily ETL executed"
content = f"""
Daily ETL executed
- Date: {day}
- Documents loaded: {len(docs)}
- Database used: {init_objects.config_loader['vector_store']}
"""
send_email(init_objects.config_loader, subject, content)


@app.command()
def dates(date_start: str, date_end: str, init_objects=None):
if init_objects is None:
init_objects = initialize_app()
etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME])
dgt_scrapper = DGTScrapper()
docs = dgt_scrapper.download_days(
date_start=datetime.strptime(date_start, "%Y/%m/%d").date(),
date_end=datetime.strptime(date_end, "%Y/%m/%d").date(),
)
if docs:
etl_job.run(docs)

subject = "[DGT] Load ETL executed"
content = f"""
Load ETL executed
- Date start: {date_start}
- Date end: {date_end}
- Documents loaded: {len(docs)}
- Database used: {init_objects.config_loader['vector_store']}
"""
send_email(init_objects.config_loader, subject, content)

if __name__ == "__main__":
app()
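
Both commands accept an optional `init_objects` argument, so an already-initialized app can be reused instead of calling `initialize_app()` inside each command. A hedged sketch of that pattern, with an illustrative date range:

```python
# Sketch: reuse one initialized app across several ETL runs.
from src.initialize import initialize_app
from src.etls.dgt.load import today, dates

init_objects = initialize_app()
today(init_objects=init_objects)
dates("2024/01/01", "2024/01/31", init_objects=init_objects)  # illustrative date range
```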
36 changes: 36 additions & 0 deletions src/etls/dgt/metadata.py
@@ -0,0 +1,36 @@
from datetime import datetime
from pydantic import field_validator

from src.etls.common.metadata import MetadataDocument


class DGTMetadataDocument(MetadataDocument):
"""Class for keeping metadata of a DGT Document scrapped."""

# Text
filepath: str

# Source
source_name: str = "DGT"
source_type: str

    # Metadata
identificador: str
numero_consulta: str
organo: str
normativa: str = ""
url_html: str
fecha_publicacion: str
fecha_disposicion: str = ""
anio: str
mes: str
dia: str

datetime_insert: str = datetime.utcnow().isoformat()

@field_validator("fecha_publicacion", "fecha_disposicion")
@classmethod
def isoformat(cls, v):
if v:
datetime.strptime(v, "%Y-%m-%d")
return v
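
As an illustration of the metadata model, the sketch below builds one document with hypothetical values; it assumes the `MetadataDocument` base class adds no further required fields, and the validator rejects `fecha_publicacion`/`fecha_disposicion` values that are not `YYYY-MM-DD`:

```python
# Sketch with hypothetical values; non-empty dates must be YYYY-MM-DD or validation raises an error.
from src.etls.dgt.metadata import DGTMetadataDocument

doc = DGTMetadataDocument(
    filepath="/tmp/dgt_example.txt",      # hypothetical temp file holding the ruling text
    source_type="Consulta",               # hypothetical value
    identificador="64247",
    numero_consulta="V0001-24",           # hypothetical consultation number
    organo="SG de Tributos",              # hypothetical issuing body
    url_html="https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=64247&tab=2",
    fecha_publicacion="2024-01-15",
    anio="2024",
    mes="1",
    dia="15",
)
print(doc.source_name, doc.numero_consulta)
```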
180 changes: 180 additions & 0 deletions src/etls/dgt/scrapper.py
@@ -0,0 +1,180 @@
import re
import requests
import tempfile
import typing as tp
import logging as lg
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from datetime import date, datetime
from requests.exceptions import HTTPError

from src.etls.common.scrapper import BaseScrapper
from src.etls.dgt.metadata import DGTMetadataDocument
from src.etls.dgt.utils import SEARCH_POST, DOC_POST, HEADERS, TARGET_CLASSES
from src.initialize import initialize_logging  # assumed to live in src.initialize, alongside initialize_app

initialize_logging()

def _extract_target_class(soup: BeautifulSoup, target_class: str) -> str:
    """
    Extracts the text of the <p> elements inside the <tr> row tagged with the given class.

    :param soup: The BeautifulSoup document to search within.
    :param target_class: The CSS class to look for (e.g. "NUM-CONSULTA").
    :return: The concatenated, stripped text if found, otherwise an empty string.
    """

    extracted_value = ""
    # Extract the requested metadata row and concatenate its paragraphs
    row = soup.find('tr', class_=target_class)
    paragraphs = row.find_all('p', class_=target_class)
    for paragraph in paragraphs:
        extracted_value += paragraph.get_text(separator='\n', strip=True)

    return extracted_value

def _extract_metadata(soup) -> tp.Dict:
metadata_dict = {}

metadata_dict['source_type'] = soup.find('div', class_="doc_header").contents[-1].strip()

    # Metadata
if numero_consulta := _extract_target_class(soup, "NUM-CONSULTA"):
metadata_dict["numero_consulta"] = numero_consulta

if organo := _extract_target_class(soup, "ORGANO"):
metadata_dict["organo"] = organo

if normativa := _extract_target_class(soup, "NORMATIVA"):
metadata_dict["normativa"] = normativa

if fecha_publicacion := _extract_target_class(soup, "FECHA-SALIDA"):
fecha_publicacion = datetime.strptime(fecha_publicacion, "%d/%m/%Y").strftime("%Y-%m-%d")
metadata_dict["fecha_publicacion"] = fecha_publicacion
metadata_dict["fecha_disposicion"] = fecha_publicacion
metadata_dict["anio"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").year)
metadata_dict["mes"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").month)
metadata_dict["dia"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").day)

return metadata_dict

def _extract_text(soup: BeautifulSoup, target_classes: tp.List[str]) -> str:
    """
    Extracts text from HTML elements with specific classes. For each class,
    find the elements with that class and keep only the top-level ones (those
    not nested inside another element of the same class), appending their text
    to the result.
    """
extracted_text = ""
for class_name in target_classes:
for element in soup.find_all(class_=class_name):
parent_with_same_class = element.find_parent(class_=class_name)
if parent_with_same_class is None:
extracted_text += element.get_text(separator='\n', strip=True) + "\n\n"
return extracted_text


def _list_links_day(url: str, day_str: str) -> tp.List[tp.Tuple[str, int]]:
    """Get the list of documents published on a given day in the DGT search portal.

    :param url: base search url. Example: 'https://petete.tributos.hacienda.gob.es/consultas/do/search'
    :param day_str: date to scrape, in dd/mm/YYYY format
    :return: list of (document id, tab) tuples to explore
    """
    logger = lg.getLogger(_list_links_day.__name__)
    logger.info("Scraping day: %s", day_str)

    SEARCH_POST['dateIni_2'] = day_str
    SEARCH_POST['dateEnd_2'] = day_str
    SEARCH_POST['VLCMP_2'] = day_str + '..' + day_str

    extracted_docs = []
    for tab in range(1, 3):
        SEARCH_POST['tab'] = tab
        response = requests.post(url, data=SEARCH_POST, headers=HEADERS, verify=False)  # skip SSL verification
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        if "La consulta realizada no devuelve resultados" in soup.text:
            continue

        # Extract the total number of result pages
        total_pages = int(soup.find('span', id='total_pages').text)

        for page in range(1, total_pages + 1):
            SEARCH_POST['page'] = str(page)
            response = requests.post(url, data=SEARCH_POST, headers=HEADERS, verify=False)  # skip SSL verification
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Find all the docs in the response which correspond to published enquiries.
            # Use a regular expression to match 'id' attributes starting with 'doc_'
            doc_ids = soup.find_all('td', id=re.compile('^doc_'))

            # Extract the 'id' attribute from each matching tag
            current_extracted_docs = [(doc['id'].split('_')[1], tab) for doc in doc_ids]
            extracted_docs += current_extracted_docs

logger.info("Scrapped day successfully %s (%s DGT documents)", url, len(extracted_docs))

return extracted_docs

class DGTScrapper(BaseScrapper):
def download_day(self, day: date) -> tp.List[DGTMetadataDocument]:
"""Download all the documents for a specific date."""
logger = lg.getLogger(self.download_day.__name__)
logger.info("Downloading DGT content for day %s", day)
day_str = day.strftime("%d/%m/%Y")
url_search = "https://petete.tributos.hacienda.gob.es/consultas/do/search"
url = "https://petete.tributos.hacienda.gob.es/consultas/do/document"
metadata_documents = []
try:
docs = _list_links_day(url_search, day_str)
for id_doc, tab in docs:
try:
DOC_POST['doc'] = id_doc
DOC_POST['tab'] = tab
encoded_params = urlencode(DOC_POST)
url_document = f"{url}?{encoded_params}"
metadata_doc = self.download_document(url_document)
                    if metadata_doc is not None:
                        metadata_documents.append(metadata_doc)
                    else:
                        logger.info("No data found in document %s", url_document)
                except (HTTPError, AttributeError):
                    logger.error("Could not scrape document %s on day %s", url_document, day_str)
        except HTTPError:
            logger.error("Could not scrape any document on day %s", day_str)
logger.info("Downloaded DGT content for day %s", day_str)
return metadata_documents

    def download_document(self, url: str) -> tp.Optional[DGTMetadataDocument]:
"""Get text and metadata from a DGT document.

:param url: document url link. Examples:
* https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=64316&tab=2
* https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=1&tab=1
:return: document with metadata and filepath with text content
"""
logger = lg.getLogger(self.download_document.__name__)
logger.info("Scrapping document: %s", url)
response = requests.get(url, headers=HEADERS, verify=False)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

extracted_text = _extract_text(soup, TARGET_CLASSES)

# Check if enquiry has content
if "Contestación completa" not in extracted_text:
logger.info("Scrapped document is empty: %s", url)
return

with tempfile.NamedTemporaryFile("w", delete=False) as fn:
fn.write(extracted_text)
metadata_dict = _extract_metadata(soup)
metadata_dict["identificador"] = (url.split("=")[1]).split("&")[0]
metadata_dict["url_html"] = url
metadata_doc = DGTMetadataDocument(filepath=fn.name, **metadata_dict)
logger.info("Scrapped document successfully %s", url)
return metadata_doc
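
For a quick check outside the full ETL, the scraper can be exercised on its own; a minimal sketch, with an illustrative date:

```python
# Sketch: download one day of DGT rulings without running the full ETL.
from datetime import date
from src.etls.dgt.scrapper import DGTScrapper

scrapper = DGTScrapper()
docs = scrapper.download_day(date(2024, 1, 15))  # illustrative date
for doc in docs:
    print(doc.numero_consulta, doc.url_html, doc.filepath)  # ruling text is stored at doc.filepath
```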
65 changes: 65 additions & 0 deletions src/etls/dgt/utils.py
@@ -0,0 +1,65 @@
from datetime import datetime, timedelta

# POST form data used to search DGT documents
SEARCH_POST = {
'type1': 'on',
'type2': 'on',
'NMCMP_1': 'NUM-CONSULTA',
'VLCMP_1': '',
'OPCMP_1': '.Y',
'NMCMP_2': 'FECHA-SALIDA',
'VLCMP_2': '',
'dateIni_2': '',
'dateEnd_2': '',
'OPCMP_2': '.Y',
'NMCMP_3': 'NORMATIVA',
'VLCMP_3': '',
'OPCMP_3': '.Y',
'NMCMP_4': 'CUESTION-PLANTEADA',
'VLCMP_4': '',
'OPCMP_4': '.Y',
'NMCMP_5': 'DESCRIPCION-HECHOS',
'VLCMP_5': '',
'OPCMP_5': '.Y',
'NMCMP_6': 'FreeText',
'VLCMP_6': '',
'OPCMP_6': '.Y',
'NMCMP_7': 'CRITERIO',
'cmpOrder': 'NUM-CONSULTA',
'dirOrder': '0',
'auto': '',
'tab': '2',
'page': '1'
}


# POST parameters used to fetch a single document from the DGT search results
DOC_POST = {
'doc': '', # doc_id
'tab': ''
}

# HTTP headers for the requests
HEADERS = {
'Referer': 'https://petete.tributos.hacienda.gob.es/consultas',
'X-Requested-With': 'XMLHttpRequest',
}

# Target classes for extracting text
# ["NUM-CONSULTA", "ORGANO", "FECHA-SALIDA", "NORMATIVA", "DESCRIPCION-HECHOS",
# "CUESTION-PLANTEADA", "CONTESTACION-COMPL"]
TARGET_CLASSES = ["NORMATIVA", "DESCRIPCION-HECHOS", "CUESTION-PLANTEADA", "CONTESTACION-COMPL"]

def get_previous_month_dates(t_date=None):
    """
    Get the first and last day of the month before t_date (defaults to now).
    """
    if t_date is None:
        t_date = datetime.now()
    last_day_previous_month = datetime(t_date.year, t_date.month, 1) - timedelta(days=1)
    first_day_previous_month = datetime(last_day_previous_month.year, last_day_previous_month.month, 1)

    return first_day_previous_month, last_day_previous_month
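
As a worked example of `get_previous_month_dates`, for a reference date of 2024-03-10 the previous-month window is 2024-02-01 through 2024-02-29 (the helper returns `datetime` objects):

```python
# Sketch: previous-month window for an illustrative reference date.
from datetime import datetime
from src.etls.dgt.utils import get_previous_month_dates

start, end = get_previous_month_dates(datetime(2024, 3, 10))
print(start.date(), end.date())  # 2024-02-01 2024-02-29
```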