DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.

Snippets has posted 5883 posts at DZone. View Full User Profile

Treetop PDF Grammar

05.14.2009
| 3452 views |
  • submit to reddit
        
grammar PDFGrammar

  rule file
    # Top-level rule: one header, then one or more groups made of a body,
    # a cross-reference section and a trailer (optionally ended by an eol).
    header
    (
      body separator*
      xref_section separator*
      trailer eol?
    )+
  end

  rule header
    # The header is matched as one comment line ('%' up to an end of line);
    # no specific "%PDF-" prefix is checked here.
    comment
  end

  rule body
    # Any number (possibly zero) of comments, objects and separators,
    # in any order.
    (comment / object / separator)*
  end

  #---------------------------------------------
  # XREF SECTION
  #---------------------------------------------
  rule xref_section
    # The "xref" keyword on its own line, then at least one subsection.
    "xref" eol xref_subsection+
  end

  rule xref_subsection
    # A subsection header followed by one or more entries. The entry count
    # announced by the header is not checked against the number of entries.
    xref_section_header xref_entry+
  end

  rule xref_section_header
    # First object number, exactly one space, then the entry count.
    # The trailing separator* should strictly be a single eol, but some PDF
    # files do not follow the PDF spec (especially in object streams), so
    # any run of separators is tolerated.
    xref_1st_object_number ' ' xref_entry_count separator*
  end

  rule xref_1st_object_number
    # Named alias of integer: the first number of a subsection header.
    integer
  end

  rule xref_entry_count
    # Named alias of integer: the second number of a subsection header.
    integer
  end

  rule xref_entry
    # One fixed-width entry: offset, generation and in-use flag, each
    # separated by exactly one space, closed by a two-character xref_eol.
    xref_offset ' ' xref_generation ' ' xref_in_use xref_eol
  end

  rule xref_offset
    # Exactly ten decimal digits (fixed width, so leading zeros are allowed).
    [0-9] [0-9] [0-9] [0-9] [0-9]
    [0-9] [0-9] [0-9] [0-9] [0-9]
  end

  rule xref_generation
    # Exactly five decimal digits (fixed width, so leading zeros are allowed).
    [0-9] [0-9] [0-9] [0-9] [0-9]
  end

  rule xref_in_use
    # Single-letter entry type: 'n' or 'f' (in the PDF cross-reference
    # format 'n' marks an in-use entry and 'f' a free one).
    [nf]
  end

  rule xref_eol
    # End of a cross-reference entry: always exactly two characters,
    # CR LF, space CR or space LF, which keeps every entry the same width.
    "\r\n" / " \r" / " \n"
  end

  #---------------------------------------------
  # TRAILER
  #---------------------------------------------
  rule trailer
    # File trailer: the 'trailer' keyword, the trailer dictionary, the
    # 'startxref' keyword, the offset of the last cross-reference section
    # and the '%%EOF' marker.
    # Any run of separators (including none) is accepted between the tokens
    # rather than a single optional eol, so that spaces or blank lines do
    # not make the whole file unparseable. This is the same tolerance that
    # xref_section_header applies to files that do not strictly follow the
    # PDF spec.
    'trailer' separator* trailer_dictionary separator* 'startxref' separator* xref_last_section_offset separator* '%%EOF'
  end

  rule trailer_dictionary
    # Named alias of dictionary: the dictionary following 'trailer'.
    dictionary
  end

  rule xref_last_section_offset
    # Named alias of integer: the number following 'startxref'.
    integer
  end

  #---------------------------------------------
  # BODY OBJECTS
  #---------------------------------------------

  rule comment
    # '%' followed by every character up to the next end of line, then that
    # end of line itself. A comment must therefore be terminated by an eol.
    "%" ( !eol . )* eol
  end

  rule object
    # Order matters: composed objects are tried first because they begin
    # with a base object (a stream starts with a dictionary, indirect
    # definitions and references start with integers). Trying base_object
    # first would match only that prefix.
    composed_object / base_object
  end

  rule base_object
    # The simple object kinds, tried in this order.
    null /
    boolean /
    number /
    string /
    name /
    array /
    dictionary
  end

  rule composed_object
    # Objects built on top of base objects, tried in this order.
    stream /
    indirect_object_definition /
    indirect_object_reference
  end

  #=================================================

  rule null
    # The null object keyword.
    "null"
  end

  rule boolean
    # One of the two boolean keywords.
    "true" / "false"
  end

  rule number
    # Caution: order is important here. real must come before integer,
    # otherwise the integral part of a real would be matched as an integer
    # and the remainder as a second real starting with a '.' (dot).
    real / integer
  end
   
  rule integer
     '0' / sign? [1-9] [0-9]*
  end

  rule real
    # An optionally signed real, in one of two shapes:
    #   '.' digits            e.g.  -.002  .5
    #   digits '.' [digits]   e.g.  34.5  +123.6  4.  0.  0.0  007.5
    # The integral part is [0-9]+ so that "0." (no fractional digits) and
    # reals with leading zeros are accepted, consistently with "4." which
    # needs no fractional digits either.
    sign? '.' [0-9]+ / sign? [0-9]+ '.' [0-9]*
  end

  rule sign
    # A single minus or plus character.
    [-+]
  end

  #----------------------------------------
  # String
  #----------------------------------------

  rule string
    # A string is either a parenthesised literal or a '<...>' hexadecimal
    # string.
    string_litteral / string_hexadecimal
  end

  rule string_litteral
    # A literal string: parentheses around any mix of nested literal
    # strings, line continuations and plain or escaped characters.
    # continuation is tried before string_char because string_char's [^)]
    # also matches a lone backslash: in the opposite order the backslash and
    # the end of line after it are consumed as plain characters and the
    # continuation alternative can never match.
    '(' (string_litteral / continuation / string_char)* ')'
  end

  rule string_char
    # An escape sequence, or any single character other than ')'.
    # Note that [^)] also matches '(' and a lone backslash.
    escape_sequence / [^)]
  end

  rule escape_sequence
    # A backslash followed by n, r, t, b, f, '(', ')' or another backslash,
    # or a backslash followed by an octal character code.
    "\\n" /
    "\\r" /
    "\\t" /
    "\\b" /
    "\\f" /
    "\\(" /
    "\\)" /
    "\\\\" /
    octal_sequence
  end

  rule octal_sequence
    # A backslash followed by one to three octal digits; the longest form
    # is tried first so that as many digits as possible are consumed.
    "\\" (octal_digit octal_digit octal_digit / octal_digit octal_digit / octal_digit)
  end

  rule string_hexadecimal
   '<' (two_hexa_digits / separator)* '>'
  end

  rule two_hexa_digits
    # Two consecutive hexadecimal digits.
    hexa_digit hexa_digit
  end

  rule hexa_digit
    # One hexadecimal digit, lower or upper case.
    [0-9a-fA-F]
  end

  rule octal_digit
    # One octal digit.
    [0-7]
  end

  #----------------------------------------
  # Name
  #----------------------------------------

  rule name
    # '/' followed by one or more regular ASCII characters or '#' codes.
    # NOTE(review): the '+' rejects the empty name "/" and
    # regular_non_ASCII_char is not accepted here - confirm both are intended.
    '/' (regular_ASCII_char / two_digit_code)+
  end

  rule two_digit_code
    # A character written as '#' followed by its two-digit hexadecimal code
    # (for example "#20" for a space) inside a name.
    # Uses two_hexa_digits, the rule this grammar defines for a pair of
    # hexadecimal digits.
    '#' two_hexa_digits
  end

  #----------------------------------------
  # Array
  #----------------------------------------

  rule array
    # Square brackets around any number of objects and separators.
    "[" (object / separator)* "]"
  end

  #----------------------------------------
  # Dictionary
  #----------------------------------------

  rule dictionary
    # '<<' and '>>' around any number of entries, each optionally
    # surrounded by separators.
    # The second alternative (separator+) consumes separators that are not
    # followed by an entry, so a dictionary holding only white space
    # ("<< >>") is accepted. Without it the leading separator* is rolled
    # back when no entry follows and '>>' is then tested against the white
    # space. Dictionaries with entries are parsed exactly as before.
    '<<' (separator* dictionary_entry separator* / separator+)* '>>'
  end

  rule dictionary_entry
    # A key and its value, optionally separated by separators.
    dictionary_key separator* dictionary_value
  end

  rule dictionary_key
    # Named alias of name: dictionary keys are always names.
    name
  end

  rule dictionary_value
    # Named alias of object: a value may be any object.
    object
  end

  #----------------------------------------
  # Stream
  #----------------------------------------

  rule stream
    # A stream: its dictionary, the 'stream' keyword ended by CR LF or LF,
    # the raw data, then the 'endstream' keyword.
    # The data is everything up to the first occurrence of the text
    # "endstream"; the contents of the dictionary are not consulted.
    # NOTE(review): the data loop also consumes any end of line that
    # precedes 'endstream', so the eol? after it looks unable to match
    # anything - confirm.
    dictionary separator* 'stream' ("\r\n" / "\n")  ( !"endstream" . )* eol? 'endstream'
  end

  #----------------------------------------
  # Indirect object definition
  #----------------------------------------

  rule indirect_object_definition
    # "<object number> <generation number> obj <object> endobj", with any
    # number of separators (including none) between the parts.
    object_number separator*
    generation_number separator*
    'obj' separator*
    object separator*
    'endobj'
  end

  rule object_number
    # Named alias of integer: first number of an indirect object definition.
    integer
  end

  rule generation_number
    # Named alias of integer: second number of an indirect object definition.
    integer
  end

  #----------------------------------------
  # Indirect object reference
  #----------------------------------------

  rule indirect_object_reference
    # "<object number> <generation number> R", with any number of
    # separators (including none) between the parts.
    object_number_ref separator* generation_number_ref separator* "R"
  end

  rule object_number_ref
    # Named alias of integer: first number of an indirect object reference.
    integer
  end

  rule generation_number_ref
    # Named alias of integer: second number of an indirect object reference.
    integer
  end

  #----------------------------------------
  # Spaces, delimiters and characters
  #----------------------------------------

  rule eol
    # End of line: CR LF is tried first so that the pair counts as a single
    # end of line; otherwise a lone CR or a lone LF.
    "\r\n" / [\r\n]
  end

  rule white_space
    # One white-space character, given by octal code: NUL (000), TAB (011),
    # LF (012), FF (014), CR (015) or SPACE (040).
    [\000\011\012\014\015\040]
  end

  rule continuation
    # A backslash immediately followed by an end of line.
    "\\" eol
  end

  rule separator
    # One white-space character, end of line or line continuation.
    # NOTE(review): white_space already contains LF (012) and CR (015), so
    # the eol alternative looks unreachable and "\r\n" is consumed as two
    # separators - harmless wherever separator* is used, but confirm.
    white_space / eol / continuation
  end

  rule delimiter
    # One PDF delimiter character: ( ) [ ] > < { } / %
    # '/' is included because the PDF spec lists the solidus among the
    # delimiters, and regular_ASCII_char already leaves it out of the
    # regular characters.
    [()\[\]><{}/%]
  end

  # All characters except the white-space characters and the delimiters are
  # referred to as regular characters.
  rule regular_char
    # Either an ASCII or a non-ASCII regular character, as defined by the
    # two rules below.
    regular_ASCII_char / regular_non_ASCII_char
  end

  rule regular_ASCII_char
    # Printable ASCII characters from '!' to '~', except these eleven:
    #   # % ( ) / < > [ ] { }
    # '#' is left out because it introduces a two_digit_code inside a name.
    [!"$&'*+,\-\.0-9:;=?@A-Z\\\^_`a-z|~]
  end

  rule regular_non_ASCII_char
    # Explicit list of character codes in octal: 001-010, 013, 016-037
    # (\e presumably standing for 033) and 200-377, i.e. the control
    # characters that are not in white_space plus every code above 177.
    # NOTE(review): 177 (DEL) is in neither this class nor
    # regular_ASCII_char - confirm that is intended.
    [\001\002\003\004\005\006\007\010\013\016\017\020\021\022\023\024\025\026\027\030\031\032\e\034\035\036\037\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377]
  end
end