multipart.py 70 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893
  1. from .decoders import *
  2. from .exceptions import *
  3. import os
  4. import re
  5. import sys
  6. import shutil
  7. import logging
  8. import tempfile
  9. from io import BytesIO
  10. from numbers import Number
  # Unique sentinel meaning "no value cached yet". A dedicated object is used
  # because None is itself a legitimate field value (see Field.set_none).
  _missing = object()

  # States for the querystring parser.
  STATE_BEFORE_FIELD = 0
  STATE_FIELD_NAME = 1
  STATE_FIELD_DATA = 2

  # States for the multipart parser.
  STATE_START = 0
  STATE_START_BOUNDARY = 1
  STATE_HEADER_FIELD_START = 2
  STATE_HEADER_FIELD = 3
  STATE_HEADER_VALUE_START = 4
  STATE_HEADER_VALUE = 5
  STATE_HEADER_VALUE_ALMOST_DONE = 6
  STATE_HEADERS_ALMOST_DONE = 7
  STATE_PART_DATA_START = 8
  STATE_PART_DATA = 9
  STATE_PART_DATA_END = 10
  STATE_END = 11

  # Names of the multipart parser states, listed in the same order as the
  # numeric constants above so that STATES[state] is the name of `state`.
  STATES = [
      "START",
      "START_BOUNDARY",
      "HEADER_FIELD_START",
      "HEADER_FIELD",
      "HEADER_VALUE_START",
      "HEADER_VALUE",
      "HEADER_VALUE_ALMOST_DONE",
      "HEADERS_ALMOST_DONE",
      "PART_DATA_START",
      "PART_DATA",
      "PART_DATA_END",
      "END",
  ]

  # Flags for the multipart parser.
  FLAG_PART_BOUNDARY = 1
  FLAG_LAST_BOUNDARY = 2

  # Indexing a bytes object yields an integer, so the single-byte constants
  # the parsers compare against are stored as integers too.
  CR = b'\r'[0]
  LF = b'\n'[0]
  COLON = b':'[0]
  SPACE = b' '[0]
  HYPHEN = b'-'[0]
  AMPERSAND = b'&'[0]
  SEMICOLON = b';'[0]
  LOWER_A = b'a'[0]
  LOWER_Z = b'z'[0]
  NULL = b'\x00'[0]


  # Small helpers that operate on the integer byte values produced by
  # indexing or iterating a bytes object.
  def lower_char(c):
      """Return byte value *c* with bit 0x20 set (ASCII lower-casing)."""
      return c | 0x20


  def ord_char(c):
      """Return the ordinal of byte value *c*, which is *c* itself."""
      return c


  def join_bytes(b):
      """Build a bytes object from an iterable of integer byte values."""
      return bytes(list(b))


  # These are regexes for parsing header values.
  SPECIAL_CHARS = re.escape(b'()<>@,;:\\"/[]?={} \t')
  QUOTED_STR = br'"(?:\\.|[^"])*"'
  VALUE_STR = br'(?:[^' + SPECIAL_CHARS + br']+|' + QUOTED_STR + br')'
  OPTION_RE_STR = (
      br'(?:;|^)\s*([^' + SPECIAL_CHARS + br']+)\s*=\s*(' + VALUE_STR + br')'
  )
  OPTION_RE = re.compile(OPTION_RE_STR)
  QUOTE = b'"'[0]
  def parse_options_header(value):
      """Parse a Content-Type style header into its value and its options.

      :param value: the raw header value as ``bytes`` or ``str``. A ``str`` is
                    assumed to conform to WSGI, i.e. to contain only code
                    points that exist in latin-1. A falsy value (``None``,
                    ``b''``, ``''``) is accepted.
      :returns: a ``(content_type, options)`` tuple. ``content_type`` is the
                part before the first semicolon, lower-cased and stripped of
                surrounding whitespace (``b''`` for a falsy input).
                ``options`` maps lower-cased option names to their values,
                all as bytestrings, with surrounding quotes removed.
      """
      if not value:
          return (b'', {})

      # If we are passed a string, we assume that it conforms to WSGI and does
      # not contain any code point that's not in latin-1.
      if isinstance(value, str):  # pragma: no cover
          value = value.encode('latin-1')

      # If we have no options, return the normalized value on its own.
      if b';' not in value:
          return (value.lower().strip(), {})

      # Split at the first semicolon, to get our value and then options.
      ctype, rest = value.split(b';', 1)

      # Normalize exactly like the no-options branch above, so the returned
      # content type does not depend on whether options were present.
      ctype = ctype.lower().strip()

      options = {}
      for match in OPTION_RE.finditer(rest):
          key = match.group(1).lower()
          option = match.group(2)

          if option[0] == QUOTE and option[-1] == QUOTE:
              # Drop the surrounding quotes, then undo the backslash escapes
              # for backslash and double quote.
              option = option[1:-1]
              option = option.replace(b'\\\\', b'\\').replace(b'\\"', b'"')

          # IE6 sends the full client-side path instead of the file name.
          # Detect a drive-letter path ("C:\...") or a UNC path ("\\...") and
          # keep only the last backslash-separated component.
          if key == b'filename':
              if option[1:3] == b':\\' or option[:2] == b'\\\\':
                  option = option.split(b'\\')[-1]

          options[key] = option

      return ctype, options
  class Field:
      """A single parsed form field, consisting of a name and a value.

      The name that a :class:`Field` is instantiated with is the same name
      that would be found in the following HTML::

          <input name="name_goes_here" type="text"/>

      Data is added with :meth:`write`, which delegates to :meth:`on_data`,
      and the field is completed with :meth:`finalize`, which delegates to
      :meth:`on_end`.

      :param name: the name of the form field
      """

      # Defining __eq__ already makes instances unhashable; this spells it out,
      # since a Field's value changes as data is written to it.
      __hash__ = None

      def __init__(self, name):
          self._name = name
          # Chunks written so far, in order.
          self._value = []

          # Joined version of _value, cached for speed. `_missing` means the
          # cache is stale; None means the field value was explicitly set to
          # None through set_none().
          self._cache = _missing

      @classmethod
      def from_value(klass, name, value):
          """Create a :class:`Field`, set its value and finalize it.

          :param name: the name of the form field
          :param value: the value of the form field - either a bytestring or
                        None
          :returns: the new, finalized instance
          """
          f = klass(name)
          if value is None:
              f.set_none()
          else:
              f.write(value)
          f.finalize()
          return f

      def write(self, data):
          """Write some data into the form field.

          :param data: a bytestring
          :returns: the number of bytes accepted, i.e. ``len(data)``
          """
          return self.on_data(data)

      def on_data(self, data):
          """Callback invoked whenever data is written to the Field.

          :param data: a bytestring
          :returns: ``len(data)``
          """
          self._value.append(data)
          # The cached joined value no longer reflects every chunk.
          self._cache = _missing
          return len(data)

      def on_end(self):
          """Callback invoked when the Field is finalized; fills the cache."""
          if self._cache is _missing:
              self._cache = b''.join(self._value)

      def finalize(self):
          """Finalize the form field."""
          self.on_end()

      def close(self):
          """Close the Field, dropping the list of written chunks.

          The joined value stays available through :attr:`value`. Calling
          this method more than once is harmless.
          """
          if self._cache is _missing:
              self._cache = b''.join(self._value)

          # Free the chunk list. It is already gone on a repeated close().
          try:
              del self._value
          except AttributeError:
              pass

      def set_none(self):
          """Set the field value to None.

          Some fields in a querystring can have a value of None - for example,
          the string "foo&bar=&baz=asdf" has a field with the name "foo" and
          value None, one with name "bar" and value "", and one with name
          "baz" and value "asdf". Since the write() interface doesn't support
          writing None, this method exists to set that value.
          """
          self._cache = None

      @property
      def field_name(self):
          """This property returns the name of the field."""
          return self._name

      @property
      def value(self):
          """This property returns the value of the form field."""
          if self._cache is _missing:
              self._cache = b''.join(self._value)
          return self._cache

      def __eq__(self, other):
          if isinstance(other, Field):
              return (
                  self.field_name == other.field_name and
                  self.value == other.value
              )
          else:
              return NotImplemented

      def __repr__(self):
          value = self.value
          # The value is None after set_none(), and None has no length.
          if value is not None and len(value) > 97:
              # Take the repr of the first 97 items and insert three dots
              # before its closing quote, whichever quote character repr()
              # picked.
              shortened = repr(value[:97])
              v = shortened[:-1] + "..." + shortened[-1]
          else:
              v = repr(value)

          return "{}(field_name={!r}, value={})".format(
              self.__class__.__name__,
              self.field_name,
              v
          )
  class File:
      """An uploaded file, held in memory or in a file on disk.

      Data is first written to an in-memory :class:`io.BytesIO`. Once more
      bytes than the configured ``MAX_MEMORY_FILE_SIZE`` have been written,
      the contents are copied to a file on disk and all further writes go
      there.

      The behavior is controlled by the following ``config`` keys:

      .. list-table::
         :widths: 15 5 5 30
         :header-rows: 1

         * - Name
           - Type
           - Default
           - Description
         * - UPLOAD_DIR
           - `str`
           - None
           - The directory to store uploaded files in. If this is None, a
             temporary file will be created in the system's standard
             location.
         * - UPLOAD_DELETE_TMP
           - `bool`
           - True
           - Delete automatically created TMP file
         * - UPLOAD_KEEP_FILENAME
           - `bool`
           - False
           - Whether or not to keep the filename of the uploaded file. Only
             honored when UPLOAD_DIR is set and a file name was given. Only
             the final path component of the uploaded name is used, so the
             file is always created directly inside UPLOAD_DIR. Otherwise, a
             temporary name will be used.
         * - UPLOAD_KEEP_EXTENSIONS
           - `bool`
           - False
           - Whether or not to keep the uploaded file's extension. If False,
             the file will be saved without the uploaded extension.
             Otherwise, the file's extension will be maintained. Note that
             this will properly combine with the UPLOAD_KEEP_FILENAME
             setting.
         * - MAX_MEMORY_FILE_SIZE
           - `int`
           - None
           - The maximum number of bytes of a File to keep in memory. When
             more bytes than this have been written, the contents are moved
             to a file on disk. If the key is absent or None, this class
             never moves the contents to disk by itself.

      :param file_name: The name of the file that this :class:`File`
                        represents
      :param field_name: The field name that uploaded this file. Note that
                         this can be None, if, for example, the file was
                         uploaded with Content-Type application/octet-stream
      :param config: The configuration for this File. See above for valid
                     configuration keys and their corresponding values. The
                     mapping is stored by reference, not copied. When
                     omitted, an empty configuration is used.
      """

      def __init__(self, file_name, field_name=None, config=None):
          # Save configuration, set other variables default.
          self.logger = logging.getLogger(__name__)
          # A fresh dict per instance when nothing was passed; a passed-in
          # mapping is kept as-is so later changes by the caller are seen.
          self._config = {} if config is None else config
          self._in_memory = True
          self._bytes_written = 0
          self._fileobj = BytesIO()

          # Save the provided field/file name.
          self._field_name = field_name
          self._file_name = file_name

          # Our actual file name is None by default, since, depending on our
          # config, we may not actually use the provided name.
          self._actual_file_name = None

          # Always defined, so _get_disk_file() can test them even when no
          # file name was supplied.
          self._file_base = None
          self._ext = None

          if file_name is not None:
              # The name comes from the upload request. Keep only its final
              # path component so it can never point outside UPLOAD_DIR, then
              # split the extension from it.
              base, ext = os.path.splitext(os.path.basename(file_name))
              self._file_base = base
              self._ext = ext

      @property
      def field_name(self):
          """The form field associated with this file. May be None if there
          isn't one, for example when we have an application/octet-stream
          upload.
          """
          return self._field_name

      @property
      def file_name(self):
          """The file name given in the upload request."""
          return self._file_name

      @property
      def actual_file_name(self):
          """The file name that this file is saved as. Will be None if it's
          not currently saved on disk.
          """
          return self._actual_file_name

      @property
      def file_object(self):
          """The file object that we're currently writing to. Note that this
          will either be an instance of a :class:`io.BytesIO`, or a regular
          file object.
          """
          return self._fileobj

      @property
      def size(self):
          """The total size of this file, counted as the number of bytes that
          currently have been written to the file.
          """
          return self._bytes_written

      @property
      def in_memory(self):
          """A boolean representing whether or not this file object is
          currently stored in-memory or on-disk.
          """
          return self._in_memory

      def flush_to_disk(self):
          """Move the contents from the in-memory buffer to a file on disk.

          If the file is already on disk, nothing is done apart from logging
          a warning to this module's logger.

          :raises FileError: if the file on disk cannot be created
          """
          if not self._in_memory:
              self.logger.warning(
                  "Trying to flush to disk when we're not in memory"
              )
              return

          # Open the new file first: if that fails, the in-memory buffer is
          # left exactly as it was, including its write position.
          new_file = self._get_disk_file()

          # Copy everything written so far, from the start of the buffer.
          self._fileobj.seek(0)
          shutil.copyfileobj(self._fileobj, new_file)

          # Position the new file right after the data written so far.
          new_file.seek(self._bytes_written)

          # Swap the file objects; we're no longer in memory.
          old_fileobj = self._fileobj
          self._fileobj = new_file
          self._in_memory = False

          # Close the old file object.
          old_fileobj.close()

      def _get_disk_file(self):
          """Open and return the on-disk file object for this upload.

          Also records the chosen name in ``_actual_file_name``, as bytes.

          :raises FileError: if the file cannot be opened or created
          """
          self.logger.info("Opening a file on disk")

          file_dir = self._config.get('UPLOAD_DIR')
          keep_filename = self._config.get('UPLOAD_KEEP_FILENAME', False)
          keep_extensions = self._config.get('UPLOAD_KEEP_EXTENSIONS', False)
          delete_tmp = self._config.get('UPLOAD_DELETE_TMP', True)
          fs_encoding = sys.getfilesystemencoding()

          # If we have a directory, are to keep the filename, and actually
          # have a filename to keep...
          if (file_dir is not None and keep_filename and
                  self._file_base is not None):
              self.logger.info("Saving with filename in: %r", file_dir)

              # Build our filename.
              fname = self._file_base
              if keep_extensions:
                  fname = fname + self._ext

              # os.path.join() refuses to mix str and bytes, so convert the
              # name to the type of the directory for the join only.
              path_name = fname
              if isinstance(file_dir, str) and isinstance(fname, bytes):
                  path_name = fname.decode(fs_encoding)
              elif isinstance(file_dir, bytes) and isinstance(fname, str):
                  path_name = fname.encode(fs_encoding)

              path = os.path.join(file_dir, path_name)
              try:
                  self.logger.info("Opening file: %r", path)
                  tmp_file = open(path, 'w+b')
              except OSError as e:
                  self.logger.exception("Error opening temporary file")
                  raise FileError(
                      "Error opening temporary file: %r" % path
                  ) from e
          else:
              # Build options array.
              # Note that tempfile doesn't support byte names. We decode our
              # paths using the default filesystem encoding.
              options = {}
              if keep_extensions and self._ext is not None:
                  ext = self._ext
                  if isinstance(ext, bytes):
                      ext = ext.decode(fs_encoding)

                  options['suffix'] = ext
              if file_dir is not None:
                  d = file_dir
                  if isinstance(d, bytes):
                      d = d.decode(fs_encoding)

                  options['dir'] = d
              options['delete'] = delete_tmp

              # Create a temporary (named) file with the appropriate settings.
              self.logger.info("Creating a temporary file with options: %r",
                               options)
              try:
                  tmp_file = tempfile.NamedTemporaryFile(**options)
              except OSError as e:
                  self.logger.exception("Error creating named temporary file")
                  raise FileError(
                      "Error creating named temporary file"
                  ) from e

              fname = tmp_file.name

          # Encode filename as bytes.
          if isinstance(fname, str):
              fname = fname.encode(fs_encoding)

          self._actual_file_name = fname
          return tmp_file

      def write(self, data):
          """Write some data to the File.

          :param data: a bytestring
          :returns: the number of bytes written
          """
          return self.on_data(data)

      def on_data(self, data):
          """Callback invoked whenever data is written to the File.

          :param data: a bytestring
          :returns: the number of bytes written
          :raises FileError: if the contents have to be moved to disk and the
                             file on disk cannot be created
          """
          pos = self._fileobj.tell()
          bwritten = self._fileobj.write(data)
          # A file object whose write() returns None: work the count out from
          # how far the position moved.
          if bwritten is None:
              bwritten = self._fileobj.tell() - pos

          # Keep track of how many bytes we've written, including the bytes
          # of a short write, so that `size` matches what is in the file.
          self._bytes_written += bwritten

          # If the bytes written isn't the same as the length, just return.
          if bwritten != len(data):
              self.logger.warning("bwritten != len(data) (%d != %d)", bwritten,
                                  len(data))
              return bwritten

          # If we're in-memory and are over our limit, we create a file.
          max_memory = self._config.get('MAX_MEMORY_FILE_SIZE')
          if (self._in_memory and
                  max_memory is not None and
                  self._bytes_written > max_memory):
              self.logger.info("Flushing to disk")
              self.flush_to_disk()

          # Return the number of bytes written.
          return bwritten

      def on_end(self):
          """Callback invoked when the File is finalized."""
          # Flush the underlying file object
          self._fileobj.flush()

      def finalize(self):
          """Finalize the form file. This will not close the underlying file,
          but simply signal that we are finished writing to the File.
          """
          self.on_end()

      def close(self):
          """Close the File object. This will actually close the underlying
          file object (whether it's a :class:`io.BytesIO` or an actual file
          object).
          """
          self._fileobj.close()

      def __repr__(self):
          return "{}(file_name={!r}, field_name={!r})".format(
              self.__class__.__name__,
              self.file_name,
              self.field_name
          )
  class BaseParser:
      """This class is the base class for all parsers. It contains the logic
      for calling and adding callbacks.

      A callback can be one of two different forms. "Notification callbacks"
      are callbacks that are called when something happens - for example, when
      a new part of a multipart message is encountered by the parser. "Data
      callbacks" are called when we get some sort of data - for example, part
      of the body of a multipart chunk. Notification callbacks are called with
      no parameters, whereas data callbacks are called with three, as
      follows::

          data_callback(data, start, end)

      The "data" parameter is a bytestring. "start" and "end" are integer
      indexes into the "data" string that represent the data of interest.
      Thus, in a data callback, the slice `data[start:end]` represents the
      data that the callback is "interested in". The callback is not passed a
      copy of the data, since copying severely hurts performance.

      Callbacks are kept in the ``callbacks`` dict, keyed by ``"on_<name>"``.
      """

      def __init__(self):
          self.logger = logging.getLogger(__name__)
          # Start with no callbacks, so callback() and set_callback() work
          # even when a subclass does not assign its own dict afterwards.
          self.callbacks = {}

      def callback(self, name, data=None, start=None, end=None):
          """This function calls a provided callback with some data. If the
          callback is not set, will do nothing.

          :param name: The name of the callback to call (as a string), without
                       the ``on_`` prefix.
          :param data: Data to pass to the callback. If None, then it is
                       assumed that the callback is a notification callback,
                       and no parameters are given.
          :param start: An integer that is passed to the data callback.
          :param end: An integer that is passed to the data callback.
          """
          name = "on_" + name
          func = self.callbacks.get(name)
          if func is None:
              return

          # Depending on whether we're given a buffer...
          if data is not None:
              # Don't do anything if we have start == end.
              if start is not None and start == end:
                  return

              # %s rather than %d: start and end default to None, which %d
              # cannot format.
              self.logger.debug("Calling %s with data[%s:%s]", name, start,
                                end)
              func(data, start, end)
          else:
              self.logger.debug("Calling %s with no data", name)
              func()

      def set_callback(self, name, new_func):
          """Update the function for a callback. Removes from the callbacks
          dict if new_func is None.

          :param name: The name of the callback to set (as a string), without
                       the ``on_`` prefix.
          :param new_func: The new function for the callback. If None, then
                           the callback will be removed (with no error if it
                           does not exist).
          """
          if new_func is None:
              self.callbacks.pop('on_' + name, None)
          else:
              self.callbacks['on_' + name] = new_func

      def close(self):
          """Do nothing; present so every parser offers ``close()``."""
          pass  # pragma: no cover

      def finalize(self):
          """Do nothing; present so every parser offers ``finalize()``."""
          pass  # pragma: no cover

      def __repr__(self):
          return "%s()" % self.__class__.__name__
  class OctetStreamParser(BaseParser):
      """This parser parses an octet-stream request body and calls callbacks
      when incoming data is received. Callbacks are as follows:

      .. list-table::
         :widths: 15 10 30
         :header-rows: 1

         * - Callback Name
           - Parameters
           - Description
         * - on_start
           - None
           - Called when the first data is parsed.
         * - on_data
           - data, start, end
           - Called for each data chunk that is parsed.
         * - on_end
           - None
           - Called when the parser is finished parsing all data.

      :param callbacks: A dictionary of callbacks. See the documentation for
                        :class:`BaseParser`. The dictionary is stored by
                        reference; when omitted, each parser gets its own
                        empty dictionary.
      :param max_size: The maximum size of body to parse. Defaults to
                       infinity - i.e. unbounded.
      :raises ValueError: if ``max_size`` is not a number or is less than 1
      """

      def __init__(self, callbacks=None, max_size=float('inf')):
          super().__init__()
          # A dict literal as the default would be one object shared by every
          # parser created without callbacks, and set_callback() mutates it.
          self.callbacks = {} if callbacks is None else callbacks
          self._started = False

          if not isinstance(max_size, Number) or max_size < 1:
              raise ValueError("max_size must be a positive number, not %r" %
                               max_size)
          self.max_size = max_size
          self._current_size = 0

      def write(self, data):
          """Write some data to the parser, which will perform size
          verification, and then pass the data to the underlying callback.

          :param data: a bytestring
          :returns: the number of bytes of ``data`` that were processed, which
                    is less than ``len(data)`` when ``max_size`` is reached
          """
          if not self._started:
              self.callback('start')
              self._started = True

          # Truncate data length.
          data_len = len(data)
          if (self._current_size + data_len) > self.max_size:
              # We truncate the length of data that we are to process.
              new_size = int(self.max_size - self._current_size)
              self.logger.warning("Current size is %d (max %d), so truncating "
                                  "data length from %d to %d",
                                  self._current_size, self.max_size, data_len,
                                  new_size)
              data_len = new_size

          # Increment size, then callback, in case there's an exception.
          self._current_size += data_len
          self.callback('data', data, 0, data_len)
          return data_len

      def finalize(self):
          """Finalize this parser, which signals that we are finished
          parsing, and sends the on_end callback.
          """
          self.callback('end')

      def __repr__(self):
          return "%s()" % self.__class__.__name__
  561. class QuerystringParser(BaseParser):
  562. """This is a streaming querystring parser. It will consume data, and call
  563. the callbacks given when it has data.
  564. .. list-table::
  565. :widths: 15 10 30
  566. :header-rows: 1
  567. * - Callback Name
  568. - Parameters
  569. - Description
  570. * - on_field_start
  571. - None
  572. - Called when a new field is encountered.
  573. * - on_field_name
  574. - data, start, end
  575. - Called when a portion of a field's name is encountered.
  576. * - on_field_data
  577. - data, start, end
  578. - Called when a portion of a field's data is encountered.
  579. * - on_field_end
  580. - None
  581. - Called when the end of a field is encountered.
  582. * - on_end
  583. - None
  584. - Called when the parser is finished parsing all data.
  585. :param callbacks: A dictionary of callbacks. See the documentation for
  586. :class:`BaseParser`.
  587. :param strict_parsing: Whether or not to parse the body strictly. Defaults
  588. to False. If this is set to True, then the behavior
  589. of the parser changes as the following: if a field
  590. has a value with an equal sign (e.g. "foo=bar", or
  591. "foo="), it is always included. If a field has no
  592. equals sign (e.g. "...&name&..."), it will be
  593. treated as an error if 'strict_parsing' is True,
  594. otherwise included. If an error is encountered,
  595. then a
  596. :class:`multipart.exceptions.QuerystringParseError`
  597. will be raised.
  598. :param max_size: The maximum size of body to parse. Defaults to infinity -
  599. i.e. unbounded.
  600. """
  def __init__(self, callbacks=None, strict_parsing=False,
               max_size=float('inf')):
      """Set up the initial parser state.

      :param callbacks: A dictionary of callbacks, stored by reference. When
                        omitted, each parser gets its own empty dictionary.
      :param strict_parsing: Stored as ``self.strict_parsing``.
      :param max_size: The maximum size of body to parse. Defaults to
                       infinity - i.e. unbounded.
      :raises ValueError: if ``max_size`` is not a number or is less than 1
      """
      super().__init__()
      self.state = STATE_BEFORE_FIELD
      self._found_sep = False

      # A dict literal as the default would be one object shared by every
      # parser created without callbacks, and set_callback() mutates it.
      self.callbacks = {} if callbacks is None else callbacks

      # Max-size stuff
      if not isinstance(max_size, Number) or max_size < 1:
          raise ValueError("max_size must be a positive number, not %r" %
                           max_size)
      self.max_size = max_size
      self._current_size = 0

      # Should parsing be strict?
      self.strict_parsing = strict_parsing
  def write(self, data):
      """Feed a chunk of querystring data to the parser.

      The chunk is first limited to whatever still fits under ``max_size``
      and then handed to ``_internal_write``. The number of bytes that call
      reports is added to the running total, even when it raises (in which
      case nothing is added, since no count was reported).

      If an error is encountered while parsing, a QuerystringParseError will
      be raised. The "offset" attribute of the raised exception will be set
      to the offset in the input data chunk (NOT the overall stream) that
      caused the error.

      :param data: a bytestring
      :returns: the number of bytes reported as processed
      """
      usable = len(data)

      # Handle sizing: only process what still fits under the limit.
      if self._current_size + usable > self.max_size:
          allowed = int(self.max_size - self._current_size)
          self.logger.warning(
              "Current size is %d (max %d), so truncating "
              "data length from %d to %d",
              self._current_size, self.max_size, usable, allowed,
          )
          usable = allowed

      processed = 0
      try:
          processed = self._internal_write(data, usable)
      finally:
          # Runs on success and on error alike; on error `processed` is
          # still 0, so the total is left unchanged.
          self._current_size += processed

      return processed
  640. def _internal_write(self, data, length):
  641. state = self.state
  642. strict_parsing = self.strict_parsing
  643. found_sep = self._found_sep
  644. i = 0
  645. while i < length:
  646. ch = data[i]
  647. # Depending on our state...
  648. if state == STATE_BEFORE_FIELD:
  649. # If the 'found_sep' flag is set, we've already encountered
  650. # and skipped a single separator. If so, we check our strict
  651. # parsing flag and decide what to do. Otherwise, we haven't
  652. # yet reached a separator, and thus, if we do, we need to skip
  653. # it as it will be the boundary between fields that's supposed
  654. # to be there.
  655. if ch == AMPERSAND or ch == SEMICOLON:
  656. if found_sep:
  657. # If we're parsing strictly, we disallow blank chunks.
  658. if strict_parsing:
  659. e = QuerystringParseError(
  660. "Skipping duplicate ampersand/semicolon at "
  661. "%d" % i
  662. )
  663. e.offset = i
  664. raise e
  665. else:
  666. self.logger.debug("Skipping duplicate ampersand/"
  667. "semicolon at %d", i)
  668. else:
  669. # This case is when we're skipping the (first)
  670. # separator between fields, so we just set our flag
  671. # and continue on.
  672. found_sep = True
  673. else:
  674. # Emit a field-start event, and go to that state. Also,
  675. # reset the "found_sep" flag, for the next time we get to
  676. # this state.
  677. self.callback('field_start')
  678. i -= 1
  679. state = STATE_FIELD_NAME
  680. found_sep = False
  681. elif state == STATE_FIELD_NAME:
  682. # Try and find a separator - we ensure that, if we do, we only
  683. # look for the equal sign before it.
  684. sep_pos = data.find(b'&', i)
  685. if sep_pos == -1:
  686. sep_pos = data.find(b';', i)
  687. # See if we can find an equals sign in the remaining data. If
  688. # so, we can immediately emit the field name and jump to the
  689. # data state.
  690. if sep_pos != -1:
  691. equals_pos = data.find(b'=', i, sep_pos)
  692. else:
  693. equals_pos = data.find(b'=', i)
  694. if equals_pos != -1:
  695. # Emit this name.
  696. self.callback('field_name', data, i, equals_pos)
  697. # Jump i to this position. Note that it will then have 1
  698. # added to it below, which means the next iteration of this
  699. # loop will inspect the character after the equals sign.
  700. i = equals_pos
  701. state = STATE_FIELD_DATA
  702. else:
  703. # No equals sign found.
  704. if not strict_parsing:
  705. # See also comments in the STATE_FIELD_DATA case below.
  706. # If we found the separator, we emit the name and just
  707. # end - there's no data callback at all (not even with
  708. # a blank value).
  709. if sep_pos != -1:
  710. self.callback('field_name', data, i, sep_pos)
  711. self.callback('field_end')
  712. i = sep_pos - 1
  713. state = STATE_BEFORE_FIELD
  714. else:
  715. # Otherwise, no separator in this block, so the
  716. # rest of this chunk must be a name.
  717. self.callback('field_name', data, i, length)
  718. i = length
  719. else:
  720. # We're parsing strictly. If we find a separator,
  721. # this is an error - we require an equals sign.
  722. if sep_pos != -1:
  723. e = QuerystringParseError(
  724. "When strict_parsing is True, we require an "
  725. "equals sign in all field chunks. Did not "
  726. "find one in the chunk that starts at %d" %
  727. (i,)
  728. )
  729. e.offset = i
  730. raise e
  731. # No separator in the rest of this chunk, so it's just
  732. # a field name.
  733. self.callback('field_name', data, i, length)
  734. i = length
  735. elif state == STATE_FIELD_DATA:
  736. # Try finding either an ampersand or a semicolon after this
  737. # position.
  738. sep_pos = data.find(b'&', i)
  739. if sep_pos == -1:
  740. sep_pos = data.find(b';', i)
  741. # If we found it, callback this bit as data and then go back
  742. # to expecting to find a field.
  743. if sep_pos != -1:
  744. self.callback('field_data', data, i, sep_pos)
  745. self.callback('field_end')
  746. # Note that we go to the separator, which brings us to the
  747. # "before field" state. This allows us to properly emit
  748. # "field_start" events only when we actually have data for
  749. # a field of some sort.
  750. i = sep_pos - 1
  751. state = STATE_BEFORE_FIELD
  752. # Otherwise, emit the rest as data and finish.
  753. else:
  754. self.callback('field_data', data, i, length)
  755. i = length
  756. else: # pragma: no cover (error case)
  757. msg = "Reached an unknown state %d at %d" % (state, i)
  758. self.logger.warning(msg)
  759. e = QuerystringParseError(msg)
  760. e.offset = i
  761. raise e
  762. i += 1
  763. self.state = state
  764. self._found_sep = found_sep
  765. return len(data)
  766. def finalize(self):
  767. """Finalize this parser, which signals to that we are finished parsing,
  768. if we're still in the middle of a field, an on_field_end callback, and
  769. then the on_end callback.
  770. """
  771. # If we're currently in the middle of a field, we finish it.
  772. if self.state == STATE_FIELD_DATA:
  773. self.callback('field_end')
  774. self.callback('end')
  775. def __repr__(self):
  776. return "{}(strict_parsing={!r}, max_size={!r})".format(
  777. self.__class__.__name__,
  778. self.strict_parsing, self.max_size
  779. )
  780. class MultipartParser(BaseParser):
  781. """This class is a streaming multipart/form-data parser.
  782. .. list-table::
  783. :widths: 15 10 30
  784. :header-rows: 1
  785. * - Callback Name
  786. - Parameters
  787. - Description
  788. * - on_part_begin
  789. - None
  790. - Called when a new part of the multipart message is encountered.
  791. * - on_part_data
  792. - data, start, end
  793. - Called when a portion of a part's data is encountered.
  794. * - on_part_end
  795. - None
  796. - Called when the end of a part is reached.
  797. * - on_header_begin
  798. - None
  799. - Called when we've found a new header in a part of a multipart
  800. message
  801. * - on_header_field
  802. - data, start, end
  803. - Called each time an additional portion of a header is read (i.e. the
  804. part of the header that is before the colon; the "Foo" in
  805. "Foo: Bar").
  806. * - on_header_value
  807. - data, start, end
  808. - Called when we get data for a header.
  809. * - on_header_end
  810. - None
  811. - Called when the current header is finished - i.e. we've reached the
  812. newline at the end of the header.
  813. * - on_headers_finished
  814. - None
  815. - Called when all headers are finished, and before the part data
  816. starts.
  817. * - on_end
  818. - None
  819. - Called when the parser is finished parsing all data.
  820. :param boundary: The multipart boundary. This is required, and must match
  821. what is given in the HTTP request - usually in the
  822. Content-Type header.
  823. :param callbacks: A dictionary of callbacks. See the documentation for
  824. :class:`BaseParser`.
  825. :param max_size: The maximum size of body to parse. Defaults to infinity -
  826. i.e. unbounded.
  827. """
  828. def __init__(self, boundary, callbacks={}, max_size=float('inf')):
  829. # Initialize parser state.
  830. super().__init__()
  831. self.state = STATE_START
  832. self.index = self.flags = 0
  833. self.callbacks = callbacks
  834. if not isinstance(max_size, Number) or max_size < 1:
  835. raise ValueError("max_size must be a positive number, not %r" %
  836. max_size)
  837. self.max_size = max_size
  838. self._current_size = 0
  839. # Setup marks. These are used to track the state of data received.
  840. self.marks = {}
  841. # TODO: Actually use this rather than the dumb version we currently use
  842. # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
  843. # skip = [len(boundary) for x in range(256)]
  844. # for i in range(len(boundary) - 1):
  845. # skip[ord_char(boundary[i])] = len(boundary) - i - 1
  846. #
  847. # # We use a tuple since it's a constant, and marginally faster.
  848. # self.skip = tuple(skip)
  849. # Save our boundary.
  850. if isinstance(boundary, str): # pragma: no cover
  851. boundary = boundary.encode('latin-1')
  852. self.boundary = b'\r\n--' + boundary
  853. # Get a set of characters that belong to our boundary.
  854. self.boundary_chars = frozenset(self.boundary)
  855. # We also create a lookbehind list.
  856. # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
  857. # "--\r\n" at the final boundary, and the length of '\r\n--' and
  858. # '--\r\n' is 8 bytes.
  859. self.lookbehind = [NULL for x in range(len(boundary) + 8)]
  860. def write(self, data):
  861. """Write some data to the parser, which will perform size verification,
  862. and then parse the data into the appropriate location (e.g. header,
  863. data, etc.), and pass this on to the underlying callback. If an error
  864. is encountered, a MultipartParseError will be raised. The "offset"
  865. attribute on the raised exception will be set to the offset of the byte
  866. in the input chunk that caused the error.
  867. :param data: a bytestring
  868. """
  869. # Handle sizing.
  870. data_len = len(data)
  871. if (self._current_size + data_len) > self.max_size:
  872. # We truncate the length of data that we are to process.
  873. new_size = int(self.max_size - self._current_size)
  874. self.logger.warning("Current size is %d (max %d), so truncating "
  875. "data length from %d to %d",
  876. self._current_size, self.max_size, data_len,
  877. new_size)
  878. data_len = new_size
  879. l = 0
  880. try:
  881. l = self._internal_write(data, data_len)
  882. finally:
  883. self._current_size += l
  884. return l
  885. def _internal_write(self, data, length):
  886. # Get values from locals.
  887. boundary = self.boundary
  888. # Get our state, flags and index. These are persisted between calls to
  889. # this function.
  890. state = self.state
  891. index = self.index
  892. flags = self.flags
  893. # Our index defaults to 0.
  894. i = 0
  895. # Set a mark.
  896. def set_mark(name):
  897. self.marks[name] = i
  898. # Remove a mark.
  899. def delete_mark(name, reset=False):
  900. self.marks.pop(name, None)
  901. # Helper function that makes calling a callback with data easier. The
  902. # 'remaining' parameter will callback from the marked value until the
  903. # end of the buffer, and reset the mark, instead of deleting it. This
  904. # is used at the end of the function to call our callbacks with any
  905. # remaining data in this chunk.
  906. def data_callback(name, remaining=False):
  907. marked_index = self.marks.get(name)
  908. if marked_index is None:
  909. return
  910. # If we're getting remaining data, we ignore the current i value
  911. # and just call with the remaining data.
  912. if remaining:
  913. self.callback(name, data, marked_index, length)
  914. self.marks[name] = 0
  915. # Otherwise, we call it from the mark to the current byte we're
  916. # processing.
  917. else:
  918. self.callback(name, data, marked_index, i)
  919. self.marks.pop(name, None)
  920. # For each byte...
  921. while i < length:
  922. c = data[i]
  923. if state == STATE_START:
  924. # Skip leading newlines
  925. if c == CR or c == LF:
  926. i += 1
  927. self.logger.debug("Skipping leading CR/LF at %d", i)
  928. continue
  929. # index is used as in index into our boundary. Set to 0.
  930. index = 0
  931. # Move to the next state, but decrement i so that we re-process
  932. # this character.
  933. state = STATE_START_BOUNDARY
  934. i -= 1
  935. elif state == STATE_START_BOUNDARY:
  936. # Check to ensure that the last 2 characters in our boundary
  937. # are CRLF.
  938. if index == len(boundary) - 2:
  939. if c != CR:
  940. # Error!
  941. msg = "Did not find CR at end of boundary (%d)" % (i,)
  942. self.logger.warning(msg)
  943. e = MultipartParseError(msg)
  944. e.offset = i
  945. raise e
  946. index += 1
  947. elif index == len(boundary) - 2 + 1:
  948. if c != LF:
  949. msg = "Did not find LF at end of boundary (%d)" % (i,)
  950. self.logger.warning(msg)
  951. e = MultipartParseError(msg)
  952. e.offset = i
  953. raise e
  954. # The index is now used for indexing into our boundary.
  955. index = 0
  956. # Callback for the start of a part.
  957. self.callback('part_begin')
  958. # Move to the next character and state.
  959. state = STATE_HEADER_FIELD_START
  960. else:
  961. # Check to ensure our boundary matches
  962. if c != boundary[index + 2]:
  963. msg = "Did not find boundary character %r at index " \
  964. "%d" % (c, index + 2)
  965. self.logger.warning(msg)
  966. e = MultipartParseError(msg)
  967. e.offset = i
  968. raise e
  969. # Increment index into boundary and continue.
  970. index += 1
  971. elif state == STATE_HEADER_FIELD_START:
  972. # Mark the start of a header field here, reset the index, and
  973. # continue parsing our header field.
  974. index = 0
  975. # Set a mark of our header field.
  976. set_mark('header_field')
  977. # Move to parsing header fields.
  978. state = STATE_HEADER_FIELD
  979. i -= 1
  980. elif state == STATE_HEADER_FIELD:
  981. # If we've reached a CR at the beginning of a header, it means
  982. # that we've reached the second of 2 newlines, and so there are
  983. # no more headers to parse.
  984. if c == CR:
  985. delete_mark('header_field')
  986. state = STATE_HEADERS_ALMOST_DONE
  987. i += 1
  988. continue
  989. # Increment our index in the header.
  990. index += 1
  991. # Do nothing if we encounter a hyphen.
  992. if c == HYPHEN:
  993. pass
  994. # If we've reached a colon, we're done with this header.
  995. elif c == COLON:
  996. # A 0-length header is an error.
  997. if index == 1:
  998. msg = "Found 0-length header at %d" % (i,)
  999. self.logger.warning(msg)
  1000. e = MultipartParseError(msg)
  1001. e.offset = i
  1002. raise e
  1003. # Call our callback with the header field.
  1004. data_callback('header_field')
  1005. # Move to parsing the header value.
  1006. state = STATE_HEADER_VALUE_START
  1007. else:
  1008. # Lower-case this character, and ensure that it is in fact
  1009. # a valid letter. If not, it's an error.
  1010. cl = lower_char(c)
  1011. if cl < LOWER_A or cl > LOWER_Z:
  1012. msg = "Found non-alphanumeric character %r in " \
  1013. "header at %d" % (c, i)
  1014. self.logger.warning(msg)
  1015. e = MultipartParseError(msg)
  1016. e.offset = i
  1017. raise e
  1018. elif state == STATE_HEADER_VALUE_START:
  1019. # Skip leading spaces.
  1020. if c == SPACE:
  1021. i += 1
  1022. continue
  1023. # Mark the start of the header value.
  1024. set_mark('header_value')
  1025. # Move to the header-value state, reprocessing this character.
  1026. state = STATE_HEADER_VALUE
  1027. i -= 1
  1028. elif state == STATE_HEADER_VALUE:
  1029. # If we've got a CR, we're nearly done our headers. Otherwise,
  1030. # we do nothing and just move past this character.
  1031. if c == CR:
  1032. data_callback('header_value')
  1033. self.callback('header_end')
  1034. state = STATE_HEADER_VALUE_ALMOST_DONE
  1035. elif state == STATE_HEADER_VALUE_ALMOST_DONE:
  1036. # The last character should be a LF. If not, it's an error.
  1037. if c != LF:
  1038. msg = "Did not find LF character at end of header " \
  1039. "(found %r)" % (c,)
  1040. self.logger.warning(msg)
  1041. e = MultipartParseError(msg)
  1042. e.offset = i
  1043. raise e
  1044. # Move back to the start of another header. Note that if that
  1045. # state detects ANOTHER newline, it'll trigger the end of our
  1046. # headers.
  1047. state = STATE_HEADER_FIELD_START
  1048. elif state == STATE_HEADERS_ALMOST_DONE:
  1049. # We're almost done our headers. This is reached when we parse
  1050. # a CR at the beginning of a header, so our next character
  1051. # should be a LF, or it's an error.
  1052. if c != LF:
  1053. msg = f"Did not find LF at end of headers (found {c!r})"
  1054. self.logger.warning(msg)
  1055. e = MultipartParseError(msg)
  1056. e.offset = i
  1057. raise e
  1058. self.callback('headers_finished')
  1059. state = STATE_PART_DATA_START
  1060. elif state == STATE_PART_DATA_START:
  1061. # Mark the start of our part data.
  1062. set_mark('part_data')
  1063. # Start processing part data, including this character.
  1064. state = STATE_PART_DATA
  1065. i -= 1
  1066. elif state == STATE_PART_DATA:
  1067. # We're processing our part data right now. During this, we
  1068. # need to efficiently search for our boundary, since any data
  1069. # on any number of lines can be a part of the current data.
  1070. # We use the Boyer-Moore-Horspool algorithm to efficiently
  1071. # search through the remainder of the buffer looking for our
  1072. # boundary.
  1073. # Save the current value of our index. We use this in case we
  1074. # find part of a boundary, but it doesn't match fully.
  1075. prev_index = index
  1076. # Set up variables.
  1077. boundary_length = len(boundary)
  1078. boundary_end = boundary_length - 1
  1079. data_length = length
  1080. boundary_chars = self.boundary_chars
  1081. # If our index is 0, we're starting a new part, so start our
  1082. # search.
  1083. if index == 0:
  1084. # Search forward until we either hit the end of our buffer,
  1085. # or reach a character that's in our boundary.
  1086. i += boundary_end
  1087. while i < data_length - 1 and data[i] not in boundary_chars:
  1088. i += boundary_length
  1089. # Reset i back the length of our boundary, which is the
  1090. # earliest possible location that could be our match (i.e.
  1091. # if we've just broken out of our loop since we saw the
  1092. # last character in our boundary)
  1093. i -= boundary_end
  1094. c = data[i]
  1095. # Now, we have a couple of cases here. If our index is before
  1096. # the end of the boundary...
  1097. if index < boundary_length:
  1098. # If the character matches...
  1099. if boundary[index] == c:
  1100. # If we found a match for our boundary, we send the
  1101. # existing data.
  1102. if index == 0:
  1103. data_callback('part_data')
  1104. # The current character matches, so continue!
  1105. index += 1
  1106. else:
  1107. index = 0
  1108. # Our index is equal to the length of our boundary!
  1109. elif index == boundary_length:
  1110. # First we increment it.
  1111. index += 1
  1112. # Now, if we've reached a newline, we need to set this as
  1113. # the potential end of our boundary.
  1114. if c == CR:
  1115. flags |= FLAG_PART_BOUNDARY
  1116. # Otherwise, if this is a hyphen, we might be at the last
  1117. # of all boundaries.
  1118. elif c == HYPHEN:
  1119. flags |= FLAG_LAST_BOUNDARY
  1120. # Otherwise, we reset our index, since this isn't either a
  1121. # newline or a hyphen.
  1122. else:
  1123. index = 0
  1124. # Our index is right after the part boundary, which should be
  1125. # a LF.
  1126. elif index == boundary_length + 1:
  1127. # If we're at a part boundary (i.e. we've seen a CR
  1128. # character already)...
  1129. if flags & FLAG_PART_BOUNDARY:
  1130. # We need a LF character next.
  1131. if c == LF:
  1132. # Unset the part boundary flag.
  1133. flags &= (~FLAG_PART_BOUNDARY)
  1134. # Callback indicating that we've reached the end of
  1135. # a part, and are starting a new one.
  1136. self.callback('part_end')
  1137. self.callback('part_begin')
  1138. # Move to parsing new headers.
  1139. index = 0
  1140. state = STATE_HEADER_FIELD_START
  1141. i += 1
  1142. continue
  1143. # We didn't find an LF character, so no match. Reset
  1144. # our index and clear our flag.
  1145. index = 0
  1146. flags &= (~FLAG_PART_BOUNDARY)
  1147. # Otherwise, if we're at the last boundary (i.e. we've
  1148. # seen a hyphen already)...
  1149. elif flags & FLAG_LAST_BOUNDARY:
  1150. # We need a second hyphen here.
  1151. if c == HYPHEN:
  1152. # Callback to end the current part, and then the
  1153. # message.
  1154. self.callback('part_end')
  1155. self.callback('end')
  1156. state = STATE_END
  1157. else:
  1158. # No match, so reset index.
  1159. index = 0
  1160. # If we have an index, we need to keep this byte for later, in
  1161. # case we can't match the full boundary.
  1162. if index > 0:
  1163. self.lookbehind[index - 1] = c
  1164. # Otherwise, our index is 0. If the previous index is not, it
  1165. # means we reset something, and we need to take the data we
  1166. # thought was part of our boundary and send it along as actual
  1167. # data.
  1168. elif prev_index > 0:
  1169. # Callback to write the saved data.
  1170. lb_data = join_bytes(self.lookbehind)
  1171. self.callback('part_data', lb_data, 0, prev_index)
  1172. # Overwrite our previous index.
  1173. prev_index = 0
  1174. # Re-set our mark for part data.
  1175. set_mark('part_data')
  1176. # Re-consider the current character, since this could be
  1177. # the start of the boundary itself.
  1178. i -= 1
  1179. elif state == STATE_END:
  1180. # Do nothing and just consume a byte in the end state.
  1181. if c not in (CR, LF):
  1182. self.logger.warning("Consuming a byte '0x%x' in the end state", c)
  1183. else: # pragma: no cover (error case)
  1184. # We got into a strange state somehow! Just stop processing.
  1185. msg = "Reached an unknown state %d at %d" % (state, i)
  1186. self.logger.warning(msg)
  1187. e = MultipartParseError(msg)
  1188. e.offset = i
  1189. raise e
  1190. # Move to the next byte.
  1191. i += 1
  1192. # We call our callbacks with any remaining data. Note that we pass
  1193. # the 'remaining' flag, which sets the mark back to 0 instead of
  1194. # deleting it, if it's found. This is because, if the mark is found
  1195. # at this point, we assume that there's data for one of these things
  1196. # that has been parsed, but not yet emitted. And, as such, it implies
  1197. # that we haven't yet reached the end of this 'thing'. So, by setting
  1198. # the mark to 0, we cause any data callbacks that take place in future
  1199. # calls to this function to start from the beginning of that buffer.
  1200. data_callback('header_field', True)
  1201. data_callback('header_value', True)
  1202. data_callback('part_data', True)
  1203. # Save values to locals.
  1204. self.state = state
  1205. self.index = index
  1206. self.flags = flags
  1207. # Return our data length to indicate no errors, and that we processed
  1208. # all of it.
  1209. return length
  1210. def finalize(self):
  1211. """Finalize this parser, which signals to that we are finished parsing.
  1212. Note: It does not currently, but in the future, it will verify that we
  1213. are in the final state of the parser (i.e. the end of the multipart
  1214. message is well-formed), and, if not, throw an error.
  1215. """
  1216. # TODO: verify that we're in the state STATE_END, otherwise throw an
  1217. # error or otherwise state that we're not finished parsing.
  1218. pass
  1219. def __repr__(self):
  1220. return f"{self.__class__.__name__}(boundary={self.boundary!r})"
  1221. class FormParser:
  1222. """This class is the all-in-one form parser. Given all the information
  1223. necessary to parse a form, it will instantiate the correct parser, create
  1224. the proper :class:`Field` and :class:`File` classes to store the data that
  1225. is parsed, and call the two given callbacks with each field and file as
  1226. they become available.
  1227. :param content_type: The Content-Type of the incoming request. This is
  1228. used to select the appropriate parser.
  1229. :param on_field: The callback to call when a field has been parsed and is
  1230. ready for usage. See above for parameters.
  1231. :param on_file: The callback to call when a file has been parsed and is
  1232. ready for usage. See above for parameters.
  1233. :param on_end: An optional callback to call when all fields and files in a
  1234. request has been parsed. Can be None.
  1235. :param boundary: If the request is a multipart/form-data request, this
  1236. should be the boundary of the request, as given in the
  1237. Content-Type header, as a bytestring.
  1238. :param file_name: If the request is of type application/octet-stream, then
  1239. the body of the request will not contain any information
  1240. about the uploaded file. In such cases, you can provide
  1241. the file name of the uploaded file manually.
  1242. :param FileClass: The class to use for uploaded files. Defaults to
  1243. :class:`File`, but you can provide your own class if you
  1244. wish to customize behaviour. The class will be
  1245. instantiated as FileClass(file_name, field_name), and it
  1246. must provide the following functions::
  1247. file_instance.write(data)
  1248. file_instance.finalize()
  1249. file_instance.close()
  1250. :param FieldClass: The class to use for uploaded fields. Defaults to
  1251. :class:`Field`, but you can provide your own class if
  1252. you wish to customize behaviour. The class will be
  1253. instantiated as FieldClass(field_name), and it must
  1254. provide the following functions::
  1255. field_instance.write(data)
  1256. field_instance.finalize()
  1257. field_instance.close()
  1258. :param config: Configuration to use for this FormParser. The default
  1259. values are taken from the DEFAULT_CONFIG value, and then
  1260. any keys present in this dictionary will overwrite the
  1261. default values.
  1262. """
  1263. #: This is the default configuration for our form parser.
  1264. #: Note: all file sizes should be in bytes.
  1265. DEFAULT_CONFIG = {
  1266. 'MAX_BODY_SIZE': float('inf'),
  1267. 'MAX_MEMORY_FILE_SIZE': 1 * 1024 * 1024,
  1268. 'UPLOAD_DIR': None,
  1269. 'UPLOAD_KEEP_FILENAME': False,
  1270. 'UPLOAD_KEEP_EXTENSIONS': False,
  1271. # Error on invalid Content-Transfer-Encoding?
  1272. 'UPLOAD_ERROR_ON_BAD_CTE': False,
  1273. }
  1274. def __init__(self, content_type, on_field, on_file, on_end=None,
  1275. boundary=None, file_name=None, FileClass=File,
  1276. FieldClass=Field, config={}):
  1277. self.logger = logging.getLogger(__name__)
  1278. # Save variables.
  1279. self.content_type = content_type
  1280. self.boundary = boundary
  1281. self.bytes_received = 0
  1282. self.parser = None
  1283. # Save callbacks.
  1284. self.on_field = on_field
  1285. self.on_file = on_file
  1286. self.on_end = on_end
  1287. # Save classes.
  1288. self.FileClass = File
  1289. self.FieldClass = Field
  1290. # Set configuration options.
  1291. self.config = self.DEFAULT_CONFIG.copy()
  1292. self.config.update(config)
  1293. # Depending on the Content-Type, we instantiate the correct parser.
  1294. if content_type == 'application/octet-stream':
  1295. # Work around the lack of 'nonlocal' in Py2
  1296. class vars:
  1297. f = None
  1298. def on_start():
  1299. vars.f = FileClass(file_name, None, config=self.config)
  1300. def on_data(data, start, end):
  1301. vars.f.write(data[start:end])
  1302. def on_end():
  1303. # Finalize the file itself.
  1304. vars.f.finalize()
  1305. # Call our callback.
  1306. on_file(vars.f)
  1307. # Call the on-end callback.
  1308. if self.on_end is not None:
  1309. self.on_end()
  1310. callbacks = {
  1311. 'on_start': on_start,
  1312. 'on_data': on_data,
  1313. 'on_end': on_end,
  1314. }
  1315. # Instantiate an octet-stream parser
  1316. parser = OctetStreamParser(callbacks,
  1317. max_size=self.config['MAX_BODY_SIZE'])
  1318. elif (content_type == 'application/x-www-form-urlencoded' or
  1319. content_type == 'application/x-url-encoded'):
  1320. name_buffer = []
  1321. class vars:
  1322. f = None
  1323. def on_field_start():
  1324. pass
  1325. def on_field_name(data, start, end):
  1326. name_buffer.append(data[start:end])
  1327. def on_field_data(data, start, end):
  1328. if vars.f is None:
  1329. vars.f = FieldClass(b''.join(name_buffer))
  1330. del name_buffer[:]
  1331. vars.f.write(data[start:end])
  1332. def on_field_end():
  1333. # Finalize and call callback.
  1334. if vars.f is None:
  1335. # If we get here, it's because there was no field data.
  1336. # We create a field, set it to None, and then continue.
  1337. vars.f = FieldClass(b''.join(name_buffer))
  1338. del name_buffer[:]
  1339. vars.f.set_none()
  1340. vars.f.finalize()
  1341. on_field(vars.f)
  1342. vars.f = None
  1343. def on_end():
  1344. if self.on_end is not None:
  1345. self.on_end()
  1346. # Setup callbacks.
  1347. callbacks = {
  1348. 'on_field_start': on_field_start,
  1349. 'on_field_name': on_field_name,
  1350. 'on_field_data': on_field_data,
  1351. 'on_field_end': on_field_end,
  1352. 'on_end': on_end,
  1353. }
  1354. # Instantiate parser.
  1355. parser = QuerystringParser(
  1356. callbacks=callbacks,
  1357. max_size=self.config['MAX_BODY_SIZE']
  1358. )
  1359. elif content_type == 'multipart/form-data':
  1360. if boundary is None:
  1361. self.logger.error("No boundary given")
  1362. raise FormParserError("No boundary given")
  1363. header_name = []
  1364. header_value = []
  1365. headers = {}
  1366. # No 'nonlocal' on Python 2 :-(
  1367. class vars:
  1368. f = None
  1369. writer = None
  1370. is_file = False
  1371. def on_part_begin():
  1372. pass
  1373. def on_part_data(data, start, end):
  1374. bytes_processed = vars.writer.write(data[start:end])
  1375. # TODO: check for error here.
  1376. return bytes_processed
  1377. def on_part_end():
  1378. vars.f.finalize()
  1379. if vars.is_file:
  1380. on_file(vars.f)
  1381. else:
  1382. on_field(vars.f)
  1383. def on_header_field(data, start, end):
  1384. header_name.append(data[start:end])
  1385. def on_header_value(data, start, end):
  1386. header_value.append(data[start:end])
  1387. def on_header_end():
  1388. headers[b''.join(header_name)] = b''.join(header_value)
  1389. del header_name[:]
  1390. del header_value[:]
  1391. def on_headers_finished():
  1392. # Reset the 'is file' flag.
  1393. vars.is_file = False
  1394. # Parse the content-disposition header.
  1395. # TODO: handle mixed case
  1396. content_disp = headers.get(b'Content-Disposition')
  1397. disp, options = parse_options_header(content_disp)
  1398. # Get the field and filename.
  1399. field_name = options.get(b'name')
  1400. file_name = options.get(b'filename')
  1401. # TODO: check for errors
  1402. # Create the proper class.
  1403. if file_name is None:
  1404. vars.f = FieldClass(field_name)
  1405. else:
  1406. vars.f = FileClass(file_name, field_name, config=self.config)
  1407. vars.is_file = True
  1408. # Parse the given Content-Transfer-Encoding to determine what
  1409. # we need to do with the incoming data.
  1410. # TODO: check that we properly handle 8bit / 7bit encoding.
  1411. transfer_encoding = headers.get(b'Content-Transfer-Encoding',
  1412. b'7bit')
  1413. if (transfer_encoding == b'binary' or
  1414. transfer_encoding == b'8bit' or
  1415. transfer_encoding == b'7bit'):
  1416. vars.writer = vars.f
  1417. elif transfer_encoding == b'base64':
  1418. vars.writer = Base64Decoder(vars.f)
  1419. elif transfer_encoding == b'quoted-printable':
  1420. vars.writer = QuotedPrintableDecoder(vars.f)
  1421. else:
  1422. self.logger.warning("Unknown Content-Transfer-Encoding: "
  1423. "%r", transfer_encoding)
  1424. if self.config['UPLOAD_ERROR_ON_BAD_CTE']:
  1425. raise FormParserError(
  1426. 'Unknown Content-Transfer-Encoding "{}"'.format(
  1427. transfer_encoding
  1428. )
  1429. )
  1430. else:
  1431. # If we aren't erroring, then we just treat this as an
  1432. # unencoded Content-Transfer-Encoding.
  1433. vars.writer = vars.f
  1434. def on_end():
  1435. vars.writer.finalize()
  1436. if self.on_end is not None:
  1437. self.on_end()
  1438. # These are our callbacks for the parser.
  1439. callbacks = {
  1440. 'on_part_begin': on_part_begin,
  1441. 'on_part_data': on_part_data,
  1442. 'on_part_end': on_part_end,
  1443. 'on_header_field': on_header_field,
  1444. 'on_header_value': on_header_value,
  1445. 'on_header_end': on_header_end,
  1446. 'on_headers_finished': on_headers_finished,
  1447. 'on_end': on_end,
  1448. }
  1449. # Instantiate a multipart parser.
  1450. parser = MultipartParser(boundary, callbacks,
  1451. max_size=self.config['MAX_BODY_SIZE'])
  1452. else:
  1453. self.logger.warning("Unknown Content-Type: %r", content_type)
  1454. raise FormParserError("Unknown Content-Type: {}".format(
  1455. content_type
  1456. ))
  1457. self.parser = parser
  def write(self, data):
      """Feed a chunk of data to the underlying parser.

      The running ``bytes_received`` total is increased by the length of
      ``data`` before the chunk is handed over to ``self.parser``.

      :param data: a bytestring

      :returns: whatever the underlying parser's ``write()`` returns.
      """
      chunk_len = len(data)
      self.bytes_received = self.bytes_received + chunk_len

      # TODO: check the parser's return value for errors?
      result = self.parser.write(data)
      return result
  def finalize(self):
      """Finalize the underlying parser, if it supports finalization.

      Nothing happens when no parser is set, or when the parser has no
      ``finalize`` attribute.
      """
      parser = self.parser
      if parser is None or not hasattr(parser, 'finalize'):
          return
      parser.finalize()
  def close(self):
      """Close the underlying parser, if it supports being closed.

      Nothing happens when no parser is set, or when the parser has no
      ``close`` attribute.
      """
      parser = self.parser
      if parser is None or not hasattr(parser, 'close'):
          return
      parser.close()
  def __repr__(self):
      """Return a debug string with the class name, content type and parser."""
      cls_name = self.__class__.__name__
      template = "{}(content_type={!r}, parser={!r})"
      return template.format(cls_name, self.content_type, self.parser)
  def create_form_parser(headers, on_field, on_file, trust_x_headers=False,
                         config=None):
      """This function is a helper function to aid in creating a FormParser
      instance.  Given a dictionary-like headers object, it extracts the
      content type, the optional boundary and the optional X-File-Name,
      instantiates a FormParser with those values and the given callbacks,
      and returns it.

      :param headers: A dictionary-like object of HTTP headers.  The only
                      required header is Content-Type.

      :param on_field: Callback to call with each parsed field.

      :param on_file: Callback to call with each parsed file.

      :param trust_x_headers: Whether or not to trust information received from
                              certain X-Headers - for example, the file name
                              from X-File-Name.  NOTE: this flag is currently
                              not consulted by this function; X-File-Name is
                              always read and forwarded to the FormParser.

      :param config: Configuration variables to pass to the FormParser.  When
                     omitted (or ``None``), a fresh empty dict is used for
                     each call.

      :returns: the newly created FormParser.

      :raises ValueError: if no Content-Type header is present.
      """
      content_type = headers.get('Content-Type')
      if content_type is None:
          logging.getLogger(__name__).warning("No Content-Type header given")
          raise ValueError("No Content-Type header given!")

      # Use a fresh dict per call instead of a shared mutable default, so that
      # nothing downstream can leak configuration between unrelated parsers.
      if config is None:
          config = {}

      # Boundaries are optional (the FormParser will raise if one is needed
      # but not given).
      content_type, params = parse_options_header(content_type)
      boundary = params.get(b'boundary')

      # We need content_type to be a string, not a bytes object.
      content_type = content_type.decode('latin-1')

      # File names are optional.
      file_name = headers.get('X-File-Name')

      # Instantiate a form parser and hand it back to the caller.
      return FormParser(content_type,
                        on_field,
                        on_file,
                        boundary=boundary,
                        file_name=file_name,
                        config=config)
  def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576,
                 **kwargs):
      """This function is useful if you just want to parse a request body,
      without too much work.  Pass it a dictionary-like object of the request's
      headers, and a file-like object for the input stream, along with two
      callbacks that will get called whenever a field or file is parsed.

      :param headers: A dictionary-like object of HTTP headers.  The only
                      required header is Content-Type.

      :param input_stream: A file-like object that represents the request body.
                           The read() method must return bytestrings.

      :param on_field: Callback to call with each parsed field.

      :param on_file: Callback to call with each parsed file.

      :param chunk_size: The maximum size to read from the input stream and
                         write to the parser at one time.  Defaults to 1 MiB.
                         Must be at least 1.

      :param kwargs: Accepted for backward compatibility; currently unused.

      :raises ValueError: if ``chunk_size`` is smaller than 1, or if the
                          Content-Length header is not a valid integer.
      """
      # A non-positive chunk size could never make progress (0) or would ask
      # the stream for everything at once (negative), so reject it up front.
      if chunk_size < 1:
          raise ValueError("chunk_size must be a positive integer")

      # Create our form parser.
      parser = create_form_parser(headers, on_field, on_file)

      # Read chunks of at most ``chunk_size`` bytes and write them to the
      # parser, but never read more than the given Content-Length, if any.
      content_length = headers.get('Content-Length')
      if content_length is not None:
          content_length = int(content_length)
      else:
          # No declared length: read until the stream runs dry.
          content_length = float('inf')

      bytes_read = 0

      while True:
          # Read only up to the Content-Length given.
          max_readable = min(content_length - bytes_read, chunk_size)
          buff = input_stream.read(max_readable)

          # Write to the parser and update our length.
          parser.write(buff)
          bytes_read += len(buff)

          # If we get a buffer that's smaller than the size requested, or if we
          # have read up to our content length, we're done.
          if len(buff) != max_readable or bytes_read == content_length:
              break

      # Tell our parser that we're done writing data.
      parser.finalize()