# Source code for aiida.common.escaping

# -*- coding: utf-8 -*-
###########################################################################
# This file is part of the AiiDA code.                                    #
#                                                                         #
# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
# For further information on the license, see the LICENSE.txt file        #
# For further information please visit http://www.aiida.net               #
###########################################################################
"""Miscellaneous functions for escaping strings."""

import re

def escape_for_bash(str_to_escape):
    """Return ``str_to_escape`` quoted so that bash reads it as one single word.

    The value is wrapped in single quotes, inside which bash performs no
    expansion at all. The only character that cannot appear inside single
    quotes is the single quote itself, so every ``'`` is replaced by the five
    characters ``'"'"'``:

    - first ``'``: closes the enclosing single-quoted section;
    - ``"'"``: a single quote character protected by double quotes;
    - last ``'``: reopens the single-quoted section for the rest of the string.

    :param str_to_escape: the value to escape; anything that is not a string
        is converted with ``str()`` first.
    :return: the single-quoted, escaped string, or an empty string (without
        quotes) if ``str_to_escape`` is ``None``.
    """
    if str_to_escape is None:
        return ''

    str_to_escape = str(str_to_escape)

    # The triple-quoted literal below is exactly the five characters '"'"'
    escaped_quotes = str_to_escape.replace("'", """'"'"'""")
    return f"'{escaped_quotes}'"

# Ordered (SQL LIKE token, regex replacement) pairs.
# The order matters: the escaped, two-character tokens must be tried before
# the bare wildcards, so that e.g. '\%' is consumed as a literal percent sign
# before '%' is considered as a wildcard.
# Raw strings are used for the SQL tokens so that every backslash written
# below is a real backslash in the resulting string.
SQL_TO_REGEX_TOKENS = [
    (r'\\', re.escape('\\')),  # escaped backslash -> literal backslash in the regex
    (r'\%', re.escape('%')),  # escaped percent -> literal '%' in the regex
    (r'\_', re.escape('_')),  # escaped underscore -> literal '_' in the regex
    ('%', '.*'),  # SQL wildcard: any number of characters
    ('_', '.'),  # SQL wildcard: exactly one character
]

def escape_for_sql_like(string):
    r"""Escape a user-provided string so that SQL LIKE treats it literally.

    SQL LIKE syntax summary:

    - % -> match any number of characters
    - _ -> match exactly one character
    - \ -> escape character (by default)

    All three special characters are prefixed with a backslash. The backslash
    itself has to be escaped as well, otherwise a literal backslash in the
    input would be read as the start of an escape sequence by LIKE.

    :param string: the literal text to be matched
    :return: the text with ``\``, ``%`` and ``_`` escaped for use in a LIKE pattern
    """
    # Backslashes must be doubled first, so that the backslashes inserted by
    # the two following replacements are not escaped a second time.
    return string.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')

def get_regex_pattern_from_sql(sql_pattern):
    r"""
    Convert a string providing a pattern to match in SQL
    syntax into a string performing the same match as a regex.

    SQL LIKE syntax summary:

    - % -> match any number of characters
    - _ -> match exactly one character

    Moreover, \ is the escape character (by default), so:

    - \\ -> single backslash
    - \% -> literal % symbol
    - \_ -> literal _ symbol

    and moreover the string should begin at the beginning of the line
    and end at the end of the line.

    :param sql_pattern: the string with the pattern in SQL syntax
    :return: a string with the pattern in regex syntax
    """
    # Build the token -> regex lookup once, rather than at every recursion step.
    regex_for_token = dict(SQL_TO_REGEX_TOKENS)

    def tokenizer(string, tokens_to_apply):
        """
        Recursive function that tokenizes a string using the provided tokens

        :param string: the string to tokenize
        :param tokens_to_apply: the list of tokens still to process (in order: the first will be processed first)
        :return: a tokenized and escaped string for regex
        """
        if tokens_to_apply:
            # We still have tokens to process: find the first occurrence of
            # the first token in the list. The order of the tokens list is
            # important, e.g. we need to match first \% and then %
            first, sep, rest = string.partition(tokens_to_apply[0])

            if sep:
                # The token was found, so it is mapped to its regex expression.
                # 'first' lies before the FIRST occurrence of this token, so
                # it cannot contain it and only the remaining tokens are
                # applied to it. 'rest' may contain further occurrences, so it
                # is processed again with the same token list.
                return (
                    tokenizer(first, tokens_to_apply=tokens_to_apply[1:])
                    + regex_for_token[sep]
                    + tokenizer(rest, tokens_to_apply=tokens_to_apply)
                )

            # The token does not occur: 'rest' is empty and 'first' is the
            # whole string. Move on to the next token; since the list shrinks
            # at every step the recursion always terminates.
            return tokenizer(first, tokens_to_apply=tokens_to_apply[1:])

        # There is no more token to process: the string is to be taken
        # literally, so it is escaped to neutralise any character that is
        # special in a regex (like $ ^ [ ] etc.)
        return re.escape(string)

    tokens = [token_pair[0] for token_pair in SQL_TO_REGEX_TOKENS]
    return f'^{tokenizer(sql_pattern, tokens_to_apply=tokens)}$'

def sql_string_match(string, pattern):
    """
    Check if the string matches the provided pattern,
    specified using SQL syntax.

    See documentation of :py:func:`~aiida.common.escaping.get_regex_pattern_from_sql`
    for an explanation of the syntax.

    :param string: the string to check
    :param pattern: the SQL pattern
    :return: True if the string matches, False otherwise
    """
    # The SQL pattern is translated to an anchored regex, and the match
    # object (or None) is collapsed to a plain boolean.
    return bool(re.match(get_regex_pattern_from_sql(pattern), string))