| | 16 | __all__ = ['FormatFlowedDecoder'] |
| | 17 | |
class FormatFlowedDecoder:
    """Object for converting format=flowed text to other formats

    The following instance attributes influence the interpretation of
    format=flowed text:
      delete_space (default: False)
        Delete the trailing space before the CRLF on flowed lines before
        interpreting the line on flowed input, corresponds to the DelSp mime
        parameter
      character_set (default: us-ascii)
        The encoding of byte strings passed in. Bytes are decoded to unicode
        using this encoding, using the default error handling scheme. Input
        that already is unicode text is used as-is.

    """
    def __init__(self, delete_space=False, character_set='us-ascii'):
        """Store the decoding options as instance attributes

        delete_space
            When true, the single trailing flow space of each flowed line is
            dropped while joining lines into a paragraph.
        character_set
            Codec name used by decode() to turn byte input into unicode.

        """
        self.delete_space = delete_space
        self.character_set = character_set

    # -- Private methods -----------------------------------------------

    def _stripquotes(self, line):
        """Remove quotemarks from the start of the line

        Returns the number of quotemarks stripped and the stripped line:

          >>> decoder = FormatFlowedDecoder()
          >>> decoder._stripquotes(u'>>> quoted line') == (3, u' quoted line')
          True

        Non-quoted lines are returned unchanged:

          >>> decoder._stripquotes(u'non-quoted line') == (
          ...     0, u'non-quoted line')
          True

        """
        stripped = line.lstrip(u'>')
        # The length difference is exactly the number of leading '>' marks.
        return len(line) - len(stripped), stripped

    def _stripstuffing(self, line):
        """Remove the optional leading space

        Returns the stripped line:

          >>> decoder = FormatFlowedDecoder()
          >>> decoder._stripstuffing(u' stuffed line') == u'stuffed line'
          True

        Non-stuffed lines are returned unchanged:

          >>> decoder._stripstuffing(u'non-stuffed line') == (
          ...     u'non-stuffed line')
          True

        Additional spacing is preserved; only one space is removed:

          >>> decoder._stripstuffing(u'  extra leading space') == (
          ...     u' extra leading space')
          True

        """
        if line.startswith(u' '):
            return line[1:]
        return line

    def _stripflow(self, line):
        """Remove the trailing flow space if delete_space is set

        The instance attribute delete_space is False by default thus this
        method returns the line unchanged:

          >>> decoder = FormatFlowedDecoder()
          >>> decoder._stripflow(u'flowed line ') == u'flowed line '
          True

        But if the delete_space attribute has been set to True the flow space
        is removed:

          >>> decoder = FormatFlowedDecoder(delete_space=True)
          >>> decoder._stripflow(u'flowed line ') == u'flowed line'
          True

        Only one flow space is removed:

          >>> decoder._stripflow(u'extra whitespace  ') == (
          ...     u'extra whitespace ')
          True

        """
        if self.delete_space and line.endswith(u' '):
            return line[:-1]
        return line

    # -- Public API ----------------------------------------------------

    def decode(self, flowed):
        """Decode flowed text

        flowed is either a byte string, which is decoded using the
        character_set attribute, or a unicode string, which is used as-is.
        Lines are expected to be separated by CRLF.

        Returns an iterable serving a sequence of (information, chunk)
        tuples. information is a dictionary with the following fields:
          type
            One of 'paragraph', 'fixed', 'signature-separator'
          quotedepth
            Number of quotemarks found on the text chunk

        chunk is a unicode string. All text is unwrapped and without any
        quotemarks; when displaying these chunks, the appropriate quotemarks
        should be added again, and chunks of type 'paragraph' should be
        displayed wrapped. Chunks of type 'fixed' should be displayed
        unwrapped.

        Here is a simple example:

          >>> CRLF = '\\r\\n'
          >>> decoder = FormatFlowedDecoder()
          >>> result = decoder.decode(CRLF.join((
          ...     ">> `Take some more tea,' the March Hare said to Alice, very ",
          ...     ">> earnestly.",
          ...     ">",
          ...     "> `I've had nothing yet,' Alice replied in an offended ",
          ...     "> tone, `so I can't take more.'",
          ...     "",
          ...     "`You mean you can't take less,' said the Hatter: `it's very ",
          ...     "easy to take more than nothing.'",
          ...     "",
          ...     "-- ",
          ...     "Lewis Carroll")))
          >>> list(result) == [
          ...     ({'quotedepth': 2, 'type': 'paragraph'},
          ...       u"`Take some more tea,' the March Hare said to Alice, "
          ...       u"very earnestly."),
          ...     ({'quotedepth': 1, 'type': 'fixed'}, u""),
          ...     ({'quotedepth': 1, 'type': 'paragraph'},
          ...       u"`I've had nothing yet,' Alice replied in an offended "
          ...       u"tone, `so I can't take more.'"),
          ...     ({'quotedepth': 0, 'type': 'fixed'}, u""),
          ...     ({'quotedepth': 0, 'type': 'paragraph'},
          ...       u"`You mean you can't take less,' said the Hatter: `it's "
          ...       u"very easy to take more than nothing.'"),
          ...     ({'quotedepth': 0, 'type': 'fixed'}, u""),
          ...     ({'quotedepth': 0, 'type': 'signature-separator'}, u"-- "),
          ...     ({'quotedepth': 0, 'type': 'fixed'}, u"Lewis Carroll")
          ... ]
          True

        The decoder can deal with various cases of improperly format=flowed
        messages. Paragraphs normally end with a fixed line, but the following
        cases are also considered paragraph-closing cases:

        - A change in quotedepth:

          >>> result = decoder.decode(CRLF.join((
          ...     "> Depth one paragraph with flow space. ",
          ...     ">> Depth two paragraph with flow space. ",
          ...     "Depth zero paragraph with fixed line.")))
          >>> list(result) == [
          ...     ({'quotedepth': 1, 'type': 'paragraph'},
          ...       u"Depth one paragraph with flow space. "),
          ...     ({'quotedepth': 2, 'type': 'paragraph'},
          ...       u"Depth two paragraph with flow space. "),
          ...     ({'quotedepth': 0, 'type': 'fixed'},
          ...       u"Depth zero paragraph with fixed line.")]
          True

        - A signature separator:

          >>> result = decoder.decode(CRLF.join((
          ...     "A paragraph with flow space. ",
          ...     "-- ")))
          >>> list(result) == [
          ...     ({'quotedepth': 0, 'type': 'paragraph'},
          ...       u"A paragraph with flow space. "),
          ...     ({'quotedepth': 0, 'type': 'signature-separator'}, u"-- ")]
          True

        - The end of the message:

          >>> result = decoder.decode(CRLF.join((
          ...     "A paragraph with flow space. ",)))
          >>> list(result) == [
          ...     ({'quotedepth': 0, 'type': 'paragraph'},
          ...       u"A paragraph with flow space. ")]
          True

        The delete_space attribute of the FormatFlowedDecoder class can be used
        to control whether or not the trailing space on flowed lines should be
        retained; this is used to encode flowed text where spaces are rare:

          >>> decoder = FormatFlowedDecoder(delete_space=True)
          >>> result = decoder.decode(CRLF.join((
          ...     "Contrived example with a word- ",
          ...     "break across the paragraph.")))
          >>> list(result) == [
          ...     ({'quotedepth': 0, 'type': 'paragraph'},
          ...       u'Contrived example with a word-break across the '
          ...       u'paragraph.')]
          True

        """
        # Only byte input needs decoding; text has no usable .decode() on
        # Python 3 and would be round-tripped through ASCII on Python 2.
        if isinstance(flowed, (bytes, bytearray)):
            flowed = flowed.decode(self.character_set)

        para = u''
        pinfo = {'type': 'paragraph'}
        for line in flowed.split(u'\r\n'):
            quotedepth, line = self._stripquotes(line)
            line = self._stripstuffing(line)

            if line == u'-- ':
                # signature separator
                if para:
                    # exception case: flowed line followed by sig-sep
                    yield (pinfo, para)
                    pinfo = {'type': 'paragraph'}
                    para = u''
                yield ({'type': 'signature-separator',
                        'quotedepth': quotedepth}, line)
                continue

            # A depth change only matters while paragraph text is pending;
            # without pending text there is nothing to flush, and a depth
            # recorded by an empty flowed line (delete_space) must not leak
            # out as an empty paragraph chunk.
            depth_changed = bool(para) and (
                quotedepth != pinfo.get('quotedepth', quotedepth))

            if line.endswith(u' '):
                # flowed line; collect into a paragraph
                if depth_changed:
                    # exception case: flowed line followed by quotedepth change
                    yield (pinfo, para)
                    pinfo = {'type': 'paragraph'}
                    para = u''
                para += self._stripflow(line)
                pinfo['quotedepth'] = quotedepth
                continue

            # fixed line
            if para:
                if depth_changed:
                    # exception case: flowed line followed by quotedepth
                    # change; close the paragraph and fall through so the
                    # fixed line is emitted on its own below
                    yield (pinfo, para)
                    pinfo = {'type': 'paragraph'}
                    para = u''
                else:
                    # completed paragraph: the fixed line is its last line
                    yield (pinfo, para + line)
                    pinfo = {'type': 'paragraph'}
                    para = u''
                    continue
            yield ({'type': 'fixed', 'quotedepth': quotedepth}, line)

        if para:
            # exception case: last line was a flowed line
            yield (pinfo, para)
| | 255 | |