sequences

Biosequence

Base class for string representations of biological polymers (nucleic acids, peptides, proteins...)

Todo

Decide how to handle several mutations at the same time while keeping indices valid (after a deletion, the position given for a later replacement or insertion might be wrong).

__new__(cls, value, name='', _provenance=None, *args, **kwargs) (staticmethod)

Show source code in core/sequences.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
    def __new__(cls, value, name="", _provenance=None, *args, **kwargs):
        """
        Create a validated, annotated sequence instance.

        We are subclassing `str` to:

        - provide a `.name` and a `._provenance` dict on every instance
        - validate input is part of the allowed alphabet (`cls.ALPHABET`)

        Parameters:
            value: Raw contents of the sequence
            name: Label stored on the new instance as `.name`
            _provenance: Optional initial provenance entries; they are copied
                into a dict owned by the new instance
            *args, **kwargs: Forwarded untouched to the parent `__new__`

        Raises:
            ValueError: If `value` contains characters outside `cls.ALPHABET`
        """
        unexpected = set(value).difference(cls.ALPHABET)
        if unexpected:
            message = (
                f"Biosequence can only contain characters in {cls.ALPHABET}, "
                f"but found these extra ones: {unexpected}."
            )
            raise ValueError(message)

        instance = super().__new__(cls, value, *args, **kwargs)
        instance.name = name
        # Every instance gets its own dict, so the caller's object is never shared
        # TODO: We might override some provenance data with this blind update
        instance._provenance = {} if _provenance is None else dict(_provenance)
        return instance

We are subclassing str to:

  • provide a ._provenance dict
  • validate input is part of the allowed alphabet

cut(self, start, stop, check=True)

Show source code in core/sequences.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
    def cut(self, start: str, stop: str, check: bool = True) -> "Biosequence":
        """
        Slice a sequence using biological notation

        Parameters:
            start: Starting element and 1-indexed position; e.g. C123
            stop: Ending element and 1-indexed position; e.g. T234.
                This will be included in the resulting sequence
            check: Whether to test if the existing elements correspond
                to those specified in the bounds

        Returns:
            Substring corresponding to [start, end]. Right bound is included!
            Its name is the current name (if any) followed by the cut bounds,
            and its provenance records the `(start, stop)` pair under `"cut"`.

        Raises:
            ValueError: If the position part of a bound is not an integer,
                or is lower than 1 (positions are 1-indexed)
            AssertionError: If `check` is true and the element found at one
                of the positions is not the one given in the bound
            IndexError: If `check` is true and a position lies beyond the
                end of the sequence

        __Examples__

        ```python
        >>> s = Biosequence("ATCGTHCTCH")
        >>> s.cut("T2", "T8")
        "TCGTHCT"
        ```
        """
        start_res, start_pos = start[0], int(start[1:])
        stop_res, stop_pos = stop[0], int(stop[1:])

        # Position 0 (or below) would silently wrap around to the end of the
        # string through negative indexing, so reject it explicitly.
        for bound, position in ((start, start_pos), (stop, stop_pos)):
            if position < 1:
                raise ValueError(
                    f"Positions are 1-indexed, but found {position} in `{bound}`"
                )

        if check:
            # Explicit raise instead of `assert`: assertions are removed when
            # Python runs with -O, which would silently disable this check.
            for residue, position in ((start_res, start_pos), (stop_res, stop_pos)):
                if residue != self[position - 1]:
                    raise AssertionError(
                        f"Element at position {position} is not {residue}"
                    )

        prefix = f"{self.name} | " if self.name else ""
        return self.__class__(
            self[start_pos - 1 : stop_pos],
            name=f"{prefix}Cut: {start}/{stop}",
            _provenance={"cut": (start, stop)},
        )

Slice a sequence using biological notation

Parameters

Name Type Description Default
start str Starting element and 1-indexed position; e.g. C123 required
stop str Ending element and 1-indexed position; e.g. T234. This will be included in the resulting sequence. required
check bool Whether to test if the existing elements correspond to those specified in the bounds True

Returns

Type Description
Biosequence Substring corresponding to [start, end]. Right bound is included!

Examples

>>> s = Biosequence("ATCGTHCTCH")
>>> s.cut("T2", "T8")
"TCGTHCT"

from_ncbi(*accessions) (classmethod)

Show source code in core/sequences.py
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
    @classmethod
    def from_ncbi(cls, *accessions: str,) -> Union["Biosequence", Iterable["Biosequence"]]:
        """
        Get FASTA sequence from an online NCBI identifier

        The accessions are joined with commas, inserted into
        `cls._ACCESSION_URL` and fetched with a single HTTP GET. The FASTA
        text of the response is then split into records, one per `>` header.

        Parameters:
            accessions: NCBI identifier. Multiple can be provided!

        Returns:
            Retrieved biosequence(s): a single object when exactly one was
            built, a list when several were built, and `None` when the
            response contains no FASTA record.

        Raises:
            NotImplementedError: If the class does not define `_ACCESSION_URL`
            ValueError: If more than `cls.ACCESSION_MAX_RETRIEVAL` accessions
                are requested at once

        __Examples__

        ```python
        >>> sequence = AminoAcidSequence.from_ncbi("AAC05299.1")
        >>> print(sequence[:10])
        MSVNSEKSSS
        >>> print(sequence.name)
        AAC05299.1 serine kinase SRPK2 [Homo sapiens]

        ```
        """
        if cls._ACCESSION_URL is None:
            raise NotImplementedError
        if len(accessions) > cls.ACCESSION_MAX_RETRIEVAL:
            raise ValueError(
                f"You can only provide {cls.ACCESSION_MAX_RETRIEVAL} accessions at the same time."
            )

        query_url = cls._ACCESSION_URL.format(",".join(accessions))
        response = requests.get(query_url)
        response.raise_for_status()

        # Minimal FASTA parser: a ">" line opens a new record, and any other
        # non-blank line is one more chunk of the record currently open.
        records = []  # (header, [chunk, ...]) pairs, in response order
        for raw_line in response.text.splitlines():
            content = raw_line.strip()
            if not content:
                continue
            if content.startswith(">"):
                records.append((content[1:], []))
                continue
            records[-1][1].append(content)

        if not records:
            return None

        # Records are paired with accessions by position.
        # NOTE(review): this presumes the service answers in request order - confirm.
        built = [
            cls(
                "".join(chunks),
                name=header,
                _provenance={"accession": accession},
            )
            for (header, chunks), accession in zip(records, accessions)
        ]
        if not built:
            return None
        return built[0] if len(built) == 1 else built

Get FASTA sequence from an online NCBI identifier

Parameters

Name Type Description Default
*accessions str NCBI identifier. Multiple can be provided! ()

Returns

Type Description
Union[ForwardRef('Biosequence'), Iterable[ForwardRef('Biosequence')]] Retrieved biosequence(s)

Examples

>>> sequence = AminoAcidSequence.from_ncbi("AAC05299.1")
>>> print(sequence[:10])
MSVNSEKSSS
>>> print(sequence.name)
AAC05299.1 serine kinase SRPK2 [Homo sapiens]

mutate(self, *mutations, raise_errors=True)

Show source code in core/sequences.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
    def mutate(self, *mutations: str, raise_errors: bool = True) -> "Biosequence":
        """
        Apply a mutation on the sequence using biological notation.

        All positions refer to the sequence as it is before any mutation is
        applied. To keep them meaningful, mutations are applied in reverse
        alphabetical order of their type, so that substitutions (which do not
        shift positions) run before the single allowed insertion or deletion.

        Parameters:
            mutations: Mutations to be applied. Indices are always 1-indexed. It can be one of:
                (1) substitution, like `C234T` (C at position 234 will be replaced by T);
                (2) deletion, like `L746-A750del` (delete everything between L at position 746
                    and A at position 750, bounds not included);
                (3) insertion, like `1151Tins` (insert a T after position 1151)
            raise_errors: Raise ValueError if one of the mutations is not supported.

        Returns:
            The edited sequence, as a new object: the sequence this method is
            called on is left untouched. `None` is returned when more than one
            insertion/deletion is requested and `raise_errors` is false.

        Raises:
            ValueError: If more than one insertion/deletion is requested and
                `raise_errors` is true.

        Examples:

        ```python
        >>> s = Biosequence("ATCGTHCTCH")
        >>> s.mutate("C3P")
        "ATPGTHCTCH"
        >>> s.mutate("T2-T5del")
        "ATTHCTCH"
        >>> s.mutate("5Tins")
        "ATCGTTHCTCH"

        ```
        """
        # Classify every requested mutation once. A list (not a dict) is used
        # so that repeated mutations are still counted individually.
        typed_mutations = [(m, self._type_mutation(m, raise_errors)) for m in mutations]

        # We can only handle one insertion or deletion at once now
        indel_count = sum(kind in ("insertion", "deletion") for _, kind in typed_mutations)
        if indel_count > 1:
            msg = f"Only one simultaneous insertion or deletion is currently supported. You provided `{','.join(mutations)}`"
            if raise_errors:
                raise ValueError(msg)
            logger.warning("Warning: %s", msg)
            return None

        # Mutations that could not be typed are skipped
        applicable = [
            (mutation, kind)
            for mutation, kind in typed_mutations
            if mutation is not None and kind is not None
        ]
        # Reverse alphabetical order of the mutation *type* (substitutions will
        # come first). The sort is stable, so same-type mutations keep the
        # order in which they were given.
        applicable.sort(key=lambda pair: pair[1], reverse=True)

        mutated = self
        for mutation, kind in applicable:
            operation = getattr(mutated, f"_mutate_with_{kind}")
            mutated = operation(mutation)

        if mutated is self:
            # Nothing produced a new object: annotate a copy so that the
            # caller's sequence does not get its name/provenance modified.
            mutated = self.__class__(str(self), name=self.name, _provenance=self._provenance)

        mutated.name += f" (mutations: {', '.join(mutations)})"
        mutated._provenance.update({"mutations": mutations})
        return mutated

Apply a mutation on the sequence using biological notation.

Parameters

Name Type Description Default
*mutations str Mutations to be applied. Indices are always 1-indexed. It can be one of: (1) substitution, like C234T (C at position 234 will be replaced by T); (2) deletion, like L746-A750del (delete everything between L at position 746 and A at position 750, bounds not included); (3) insertion, like 1151Tins (insert a T after position 1151) ()
raise_errors bool Raise ValueError if one of the mutations is not supported. True

Returns

Type Description
Biosequence The edited sequence

Examples:

>>> s = Biosequence("ATCGTHCTCH")
>>> s.mutate("C3P")
"ATPGTHCTCH"
>>> s.mutate("T2-T5del")
"ATTHCTCH"
>>> s.mutate("5Tins")
"ATCGTTHCTCH"

DNASequence

Biosequence that only allows DNA bases

RNASequence

Biosequence that only allows RNA bases


Last update: April 24, 2020