# -*- coding: utf-8 -*-
###########################################################################
# Copyright (c), The AiiDA team. All rights reserved. #
# This file is part of the AiiDA code. #
# #
# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
# For further information on the license, see the LICENSE.txt file #
# For further information please visit http://www.aiida.net #
###########################################################################
"""Base abstract Backup class for all backends."""
import datetime
import os
import logging
import shutil
from abc import ABC, abstractmethod
from dateutil.parser import parse
from aiida.common import json
from aiida.common import timezone as dtimezone
class AbstractBackup(ABC):
    """
    This class handles the backup of the AiiDA repository that is referenced
    by the current AiiDA database. The backup will start from the
    given backup timestamp (*oldest_object_backedup*) or the date of the
    oldest node/workflow object found and it will periodically backup
    (in periods of *periodicity* days) until the ending date of the backup
    specified by *end_date_of_backup* or *days_to_backup*.
    """

    # Keys in the dictionary loaded from the JSON configuration file
    OLDEST_OBJECT_BK_KEY = 'oldest_object_backedup'
    BACKUP_DIR_KEY = 'backup_dir'
    DAYS_TO_BACKUP_KEY = 'days_to_backup'
    END_DATE_OF_BACKUP_KEY = 'end_date_of_backup'
    PERIODICITY_KEY = 'periodicity'
    BACKUP_LENGTH_THRESHOLD_KEY = 'backup_length_threshold'

    # Backup parameters that will be populated from the JSON file

    # Where did the last backup stop (start point of the next round)
    _oldest_object_bk = None
    # The destination directory of the backup
    _backup_dir = None
    # How many days to backup
    _days_to_backup = None
    # Until what date we should backup
    _end_date_of_backup = None
    # How many consecutive days to backup in one round.
    _periodicity = None
    # The threshold (in hours) between the oldest object to be backed up
    # and the end of the backup. If the difference is below this threshold
    # the backup should not start.
    _backup_length_threshold = None

    # The end of the backup dates (or days) until the end are translated to
    # the following internal variable containing the end date
    _internal_end_date_of_backup = None

    # Extra minutes to go back before the last backup point (rounding safety)
    _additional_back_time_mins = None

    # When True, skip the check that the backup directory already exists
    _ignore_backup_dir_existence_check = False  # pylint: disable=invalid-name
[docs] def __init__(self, backup_info_filepath, additional_back_time_mins):
# The path to the JSON file with the backup information
self._backup_info_filepath = backup_info_filepath
self._additional_back_time_mins = additional_back_time_mins
# Configuring the logging
logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
# The logger of the backup script
self._logger = logging.getLogger('aiida.aiida_backup')
[docs] def _read_backup_info_from_file(self, backup_info_file_name):
"""
This method reads the backup information from the given file and
passes the dictionary to the method responsible for the initialization
of the needed class variables.
"""
backup_variables = None
with open(backup_info_file_name, 'r', encoding='utf8') as backup_info_file:
try:
backup_variables = json.load(backup_info_file)
except ValueError:
self._logger.error('Could not parse file %s', backup_info_file_name)
raise BackupError('Could not parse file ' + backup_info_file_name)
self._read_backup_info_from_dict(backup_variables)
[docs] def _read_backup_info_from_dict(self, backup_variables): # pylint: disable=too-many-branches,too-many-statements
"""
This method reads the backup information from the given dictionary and
sets the needed class variables.
"""
# Setting the oldest backup date. This will be used as start of
# the new backup procedure.
#
# If the oldest backup date is not set, then find the oldest
# creation timestamp and set it as the oldest backup date.
if backup_variables.get(self.OLDEST_OBJECT_BK_KEY) is None:
query_node_res = self._query_first_node()
if not query_node_res:
self._logger.error('The oldest modification date was not found.')
raise BackupError('The oldest modification date was not found.')
oldest_timestamps = []
if query_node_res:
oldest_timestamps.append(query_node_res[0].ctime)
self._oldest_object_bk = min(oldest_timestamps)
self._logger.info(
'Setting the oldest modification date to the creation date of the oldest object '
'(%s)', self._oldest_object_bk
)
# If the oldest backup date is not None then try to parse it
else:
try:
self._oldest_object_bk = parse(backup_variables.get(self.OLDEST_OBJECT_BK_KEY))
if self._oldest_object_bk.tzinfo is None:
curr_timezone = dtimezone.get_current_timezone()
self._oldest_object_bk = dtimezone.get_current_timezone().localize(self._oldest_object_bk)
self._logger.info(
'No timezone defined in the oldest modification date timestamp. Setting current timezone (%s).',
curr_timezone.zone
)
# If it is not parsable...
except ValueError:
self._logger.error('We did not manage to parse the start timestamp of the last backup.')
raise
# Setting the backup directory & normalizing it
self._backup_dir = os.path.normpath(backup_variables.get(self.BACKUP_DIR_KEY))
if (not self._ignore_backup_dir_existence_check and not os.path.isdir(self._backup_dir)):
self._logger.error('The given backup directory does not exist.')
raise BackupError('The given backup directory does not exist.')
# You can not set an end-of-backup date and end days from the backup
# that you should stop.
if (
backup_variables.get(self.DAYS_TO_BACKUP_KEY) is not None and
backup_variables.get(self.END_DATE_OF_BACKUP_KEY) is not None
):
self._logger.error('Only one end of backup date can be set.')
raise BackupError('Only one backup end can be set (date or days from backup start.')
# Check if there is an end-of-backup date
elif backup_variables.get(self.END_DATE_OF_BACKUP_KEY) is not None:
try:
self._end_date_of_backup = parse(backup_variables.get(self.END_DATE_OF_BACKUP_KEY))
if self._end_date_of_backup.tzinfo is None:
curr_timezone = dtimezone.get_current_timezone()
self._end_date_of_backup = \
curr_timezone.localize(
self._end_date_of_backup)
self._logger.info(
'No timezone defined in the end date of backup timestamp. Setting current timezone (%s).',
curr_timezone.zone
)
self._internal_end_date_of_backup = self._end_date_of_backup
except ValueError:
self._logger.error('The end date of the backup could not be parsed correctly')
raise
# Check if there is defined a days to backup
elif backup_variables.get(self.DAYS_TO_BACKUP_KEY) is not None:
try:
self._days_to_backup = int(backup_variables.get(self.DAYS_TO_BACKUP_KEY))
self._internal_end_date_of_backup = (
self._oldest_object_bk + datetime.timedelta(days=self._days_to_backup)
)
except ValueError:
self._logger.error('The days to backup should be an integer')
raise
# If the backup end is not set, then the ending date remains open
# Parse the backup periodicity.
try:
self._periodicity = int(backup_variables.get(self.PERIODICITY_KEY))
except ValueError:
self._logger.error('The backup _periodicity should be an integer')
raise
# Parse the backup length threshold
try:
hours_th = int(backup_variables.get(self.BACKUP_LENGTH_THRESHOLD_KEY))
self._backup_length_threshold = datetime.timedelta(hours=hours_th)
except ValueError:
self._logger.error('The backup length threshold should be an integer')
raise
[docs] def _dictionarize_backup_info(self):
"""
This dictionarises the backup information and returns the dictionary.
"""
backup_variables = {
self.OLDEST_OBJECT_BK_KEY: str(self._oldest_object_bk),
self.BACKUP_DIR_KEY: self._backup_dir,
self.DAYS_TO_BACKUP_KEY: self._days_to_backup,
self.END_DATE_OF_BACKUP_KEY: None if self._end_date_of_backup is None else str(self._end_date_of_backup),
self.PERIODICITY_KEY: self._periodicity,
self.BACKUP_LENGTH_THRESHOLD_KEY: int(self._backup_length_threshold.total_seconds() // 3600)
}
return backup_variables
[docs] def _store_backup_info(self, backup_info_file_name):
"""
This method writes the backup variables dictionary to a file with the
given filename.
"""
backup_variables = self._dictionarize_backup_info()
with open(backup_info_file_name, 'wb') as backup_info_file:
json.dump(backup_variables, backup_info_file)
[docs] def _find_files_to_backup(self):
"""
Query the database for nodes that were created after the
the start of the last backup. Return a query set.
"""
# Go a bit further back to avoid any rounding problems. Set the
# smallest timestamp to be backed up.
start_of_backup = (self._oldest_object_bk - datetime.timedelta(minutes=self._additional_back_time_mins))
# Find the end of backup for this round using the given _periodicity.
backup_end_for_this_round = (self._oldest_object_bk + datetime.timedelta(days=self._periodicity))
# If the end of the backup is after the given end by the user,
# adapt it accordingly
if (
self._internal_end_date_of_backup is not None and
backup_end_for_this_round > self._internal_end_date_of_backup
):
backup_end_for_this_round = self._internal_end_date_of_backup
# If the end of the backup is after the current time, adapt the end accordingly
now_timestamp = datetime.datetime.now(dtimezone.get_current_timezone())
if backup_end_for_this_round > now_timestamp:
self._logger.info(
'We can not backup until %s. We will backup until now (%s).', backup_end_for_this_round, now_timestamp
)
backup_end_for_this_round = now_timestamp
# Check if the backup length is below the backup length threshold
if backup_end_for_this_round - start_of_backup < \
self._backup_length_threshold:
self._logger.info('Backup (timestamp) length is below the given threshold. Backup finished')
return -1, None
# Construct the queries & query sets
query_sets = self._get_query_sets(start_of_backup, backup_end_for_this_round)
# Set the new start of the backup
self._oldest_object_bk = backup_end_for_this_round
# Check if threshold is 0
if self._backup_length_threshold == datetime.timedelta(hours=0):
return -2, query_sets
return 0, query_sets
[docs] @staticmethod
def _get_repository_path():
from aiida.manage.configuration import get_profile
return get_profile().repository_path
    def _backup_needed_files(self, query_sets):
        """Perform backup of a minimum-set of files.

        Copies the repository directory of every item yielded by the given
        query sets into the backup directory (mirroring the relative layout),
        then replicates the permissions/stat of all touched parent
        directories. Copy failures are logged as warnings and do not abort
        the backup (best-effort semantics).

        :param query_sets: iterable of backend-specific query sets, as
            returned by ``_get_query_sets``.
        """
        repository_path = os.path.normpath(self._get_repository_path())

        # Parent directories whose permissions must be replicated at the end.
        parent_dir_set = set()
        copy_counter = 0

        # First pass: count the directories so progress can be reported.
        dir_no_to_copy = 0
        for query_set in query_sets:
            dir_no_to_copy += self._get_query_set_length(query_set)
        self._logger.info('Start copying %s directories', dir_no_to_copy)

        last_progress_print = datetime.datetime.now()
        percent_progress = 0

        for query_set in query_sets:
            for item in self._get_query_set_iterator(query_set):
                source_dir = self._get_source_directory(item)
                # Get the relative directory without the / which
                # separates the repository_path from the relative_dir.
                relative_dir = source_dir[(len(repository_path) + 1):]
                destination_dir = os.path.join(self._backup_dir, relative_dir)

                # Remove the destination directory if it already exists
                # (a previous round may have copied a partial version).
                if os.path.exists(destination_dir):
                    shutil.rmtree(destination_dir)

                # Copy the needed directory; third positional argument (True)
                # is copytree's symlinks flag, so links are copied as links.
                try:
                    shutil.copytree(source_dir, destination_dir, True, None)
                except EnvironmentError as why:
                    # Best-effort: log and continue with the next directory.
                    self._logger.warning(
                        'Problem copying directory %s to %s. More information: %s (Error no: %s)', source_dir,
                        destination_dir, why.strerror, why.errno
                    )
                    # Raise envEr

                # Extract the needed parent directories
                AbstractBackup._extract_parent_dirs(relative_dir, parent_dir_set)
                copy_counter += 1

                log_msg = 'Copied %.0f directories [%s] (%3.0f/100)'
                # Progress report 1: at most once per minute (time-based).
                if (
                    self._logger.getEffectiveLevel() <= logging.INFO and
                    (datetime.datetime.now() - last_progress_print).seconds > 60
                ):
                    last_progress_print = datetime.datetime.now()
                    percent_progress = copy_counter * 100 / dir_no_to_copy
                    self._logger.info(log_msg, copy_counter, item.__class__.__name__, percent_progress)
                # Progress report 2: whenever the percentage has advanced.
                if (
                    self._logger.getEffectiveLevel() <= logging.INFO and percent_progress <
                    (copy_counter * 100 / dir_no_to_copy)
                ):
                    percent_progress = (copy_counter * 100 / dir_no_to_copy)
                    last_progress_print = datetime.datetime.now()
                    self._logger.info(log_msg, copy_counter, item.__class__.__name__, percent_progress)

        self._logger.info('%.0f directories copied', copy_counter)

        # Second pass: replicate stat/permissions of the parent directories.
        self._logger.info('Start setting permissions')
        perm_counter = 0
        for tmp_rel_path in parent_dir_set:
            try:
                shutil.copystat(
                    os.path.join(repository_path, tmp_rel_path), os.path.join(self._backup_dir, tmp_rel_path)
                )
            except OSError as why:
                # Best-effort again: a permission failure is logged, not fatal.
                self._logger.warning(
                    'Problem setting permissions to directory %s.', os.path.join(self._backup_dir, tmp_rel_path)
                )
                self._logger.warning(os.path.join(repository_path, tmp_rel_path))
                self._logger.warning('More information: %s (Error no: %s)', why.strerror, why.errno)
            # NOTE(review): the counter is incremented even when copystat
            # failed, so the final count includes failed attempts.
            perm_counter += 1

        self._logger.info('Set correct permissions to %.0f directories.', perm_counter)
        self._logger.info('End of backup.')
        self._logger.info('Backed up objects with modification timestamp less or equal to %s.', self._oldest_object_bk)
[docs] def run(self):
"""Run the backup"""
while True:
self._read_backup_info_from_file(self._backup_info_filepath)
item_sets_to_backup = self._find_files_to_backup()
if item_sets_to_backup[0] == -1:
break
self._backup_needed_files(item_sets_to_backup[1])
self._store_backup_info(self._backup_info_filepath)
if item_sets_to_backup[0] == -2:
self._logger.info('Threshold is 0. Backed up one round and exiting.')
break
    @abstractmethod
    def _query_first_node(self):
        """Query the oldest node; backend-specific.

        :return: a sequence whose first element exposes a ``ctime`` attribute,
            or an empty/falsy result when no node exists.
        """
    @abstractmethod
    def _get_query_set_length(self, query_set):
        """Return the number of items in the given query set; backend-specific.

        :param query_set: one of the query sets returned by ``_get_query_sets``.
        """
    @abstractmethod
    def _get_query_sets(self, start_of_backup, backup_end_for_this_round):
        """Return the query sets covering the given time window; backend-specific.

        :param start_of_backup: start timestamp of the window (inclusive).
        :param backup_end_for_this_round: end timestamp of the window.
        """
    @abstractmethod
    def _get_query_set_iterator(self, query_set):
        """Return an iterator over the items of the given query set; backend-specific.

        :param query_set: one of the query sets returned by ``_get_query_sets``.
        """
    @abstractmethod
    def _get_source_directory(self, item):
        """Return the repository directory of the given item; backend-specific.

        :param item: an object yielded by ``_get_query_set_iterator``.
        :return: absolute path of the item's directory inside the repository.
        """
class BackupError(Exception):
    """Generic error raised by the backup machinery."""

    def __init__(self, value, *args, **kwargs):
        """Store the offending value; any extra arguments go to ``Exception``."""
        super().__init__(*args, **kwargs)
        self._value = value

    def __str__(self):
        # Render the repr of the stored value, e.g. BackupError('x') -> "'x'".
        return repr(self._value)