Logo Search packages:      
Sourcecode: aegis version File versions

quoted_print.c

/*
 *    aegis - project change supervisor
 *    Copyright (C) 2001, 2002 Peter Miller;
 *    All rights reserved.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
 *
 * MANIFEST: functions to manipulate quoted_prints
 *
 * From RFC 1521...
 *
 *    The Quoted-Printable encoding is intended to represent data
 *    that largely consists of octets that correspond to printable
 *    characters in the ASCII character set. It encodes the data in
 *    such a way that the resulting octets are unlikely to be modified
 *    by mail transport. If the data being encoded are mostly ASCII
 *    text, the encoded form of the data remains largely recognizable
 *    by humans. A body which is entirely ASCII may also be encoded
 *    in Quoted-Printable to ensure the integrity of the data should
 *    the message pass through a character- translating, and/or
 *    line-wrapping gateway.
 *
 *    In this encoding, octets are to be represented as determined by
 *    the following rules:
 *
 *    Rule #1: (General 8-bit representation) Any octet, except those
 *    indicating a line break according to the newline convention
 *    of the canonical (standard) form of the data being encoded,
 *    may be represented by an "=" followed by a two digit hexadecimal
 *    representation of the octet's value. The digits of the hexadecimal
 *    alphabet, for this purpose, are "0123456789ABCDEF". Uppercase
 *    letters must be used when sending hexadecimal data, though a
 *    robust implementation may choose to recognize lowercase letters
 *    on receipt. Thus, for example, the value 12 (ASCII form feed)
 *    can be represented by "=0C", and the value 61 (ASCII EQUAL SIGN)
 *    can be represented by "=3D". Except when the following rules
 *    allow an alternative encoding, this rule is mandatory.
 *
 *    Rule #2: (Literal representation) Octets with decimal values of
 *    33 through 60 inclusive, and 62 through 126, inclusive, MAY be
 *    represented as the ASCII characters which correspond to those
 *    octets (EXCLAMATION POINT through LESS THAN, and GREATER THAN
 *    through TILDE, respectively).
 *
 *    Rule #3: (White Space): Octets with values of 9 and 32 MAY be
 *    represented as ASCII TAB (HT) and SPACE characters, respectively,
 *    but MUST NOT be so represented at the end of an encoded line. Any
 *    TAB (HT) or SPACE characters on an encoded line MUST thus be
 *    followed on that line by a printable character. In particular,
 *    an "=" at the end of an encoded line, indicating a soft line
 *    break (see rule #5) may follow one or more TAB (HT) or SPACE
 *    characters. It follows that an octet with value 9 or 32 appearing
 *    at the end of an encoded line must be represented according
 *    to Rule #1. This rule is necessary because some MTAs (Message
 *    Transport Agents, programs which transport messages from one
 *    user to another, or perform a part of such transfers) are known
 *    to pad lines of text with SPACEs, and others are known to remove
 *    "white space" characters from the end of a line. Therefore, when
 *    decoding a Quoted-Printable body, any trailing white space on
 *    a line must be deleted, as it will necessarily have been added
 *    by intermediate transport agents.
 *
 *    Rule #4 (Line Breaks): A line break in a text body, independent of
 *    what its representation is following the canonical representation
 *    of the data being encoded, must be represented by a (RFC 822)
 *    line break, which is a CRLF sequence, in the Quoted-Printable
 *    encoding. Since the canonical representation of types other than
 *    text do not generally include the representation of line breaks,
 *    no hard line breaks (i.e.  line breaks that are intended to
 *    be meaningful and to be displayed to the user) should occur
 *    in the quoted-printable encoding of such types. Of course,
 *    occurrences of "=0D", "=0A", "0A=0D" and "=0D=0A" will eventually
 *    be encountered. In general, however, base64 is preferred over
 *    quoted-printable for binary data.
 *
 *    Note that many implementations may elect to encode the local
 *    representation of various content types directly, as described
 *    in Appendix G. In particular, this may apply to plain text
 *    material on systems that use newline conventions other than
 *    CRLF delimiters. Such an implementation is permissible, but the
 *    generation of line breaks must be generalized to account for
 *    the case where alternate representations of newline sequences
 *    are used.
 *
 *    Rule #5 (Soft Line Breaks): The Quoted-Printable encoding REQUIRES
 *    that encoded lines be no more than 76 characters long. If longer
 *    lines are to be encoded with the Quoted-Printable encoding,
 *    'soft' line breaks must be used. An equal sign as the last
 *    character on a encoded line indicates such a non-significant
 *    ('soft') line break in the encoded text. Thus if the "raw"
 *    form of the line is a single unencoded line that says:
 *
 *          Now's the time for all folk to come to the aid of their country.
 *
 *    This can be represented, in the Quoted-Printable encoding, as
 *
 *          Now's the time =
 *          for all folk to come =
 *          to the aid of their country.
 *
 *    This provides a mechanism with which long lines are encoded in
 *    such a way as to be restored by the user agent. The 76 character
 *    limit does not count the trailing CRLF, but counts all other
 *    characters, including any equal signs.
 *
 *    Since the hyphen character ("-") is represented as itself
 *    in the Quoted-Printable encoding, care must be taken, when
 *    encapsulating a quoted-printable encoded body in a multipart
 *    entity, to ensure that the encapsulation boundary does not
 *    appear anywhere in the encoded body. (A good strategy is to
 *    choose a boundary that includes a character sequence such as
 *    "=_" which can never appear in a quoted- printable body. See
 *    the definition of multipart messages later in this document.)
 *
 *    NOTE: The quoted-printable encoding represents something
 *    of a compromise between readability and reliability in
 *    transport. Bodies encoded with the quoted-printable encoding
 *    will work reliably over most mail gateways, but may not
 *    work perfectly over a few gateways, notably those involving
 *    translation into EBCDIC. (In theory, an EBCDIC gateway could
 *    decode a quoted-printable body and re-encode it using base64,
 *    but such gateways do not yet exist.) A higher level of confidence
 *    is offered by the base64 Content-Transfer-Encoding. A way to get
 *    reasonably reliable transport through EBCDIC gateways is to also
 *    quote the ASCII characters
 *
 *          !"#$@[\]^`{|}~
 *
 *    according to rule #1. See Appendix B for more information.
 *
 *    Because quoted-printable data is generally assumed to be line-
 *    oriented, it is to be expected that the representation of
 *    the breaks between the lines of quoted printable data may
 *    be altered in transport, in the same manner that plain text
 *    mail has always been altered in Internet mail when passing
 *    between systems with differing newline conventions. If such
 *    alterations are likely to constitute a corruption of the data,
 *    it is probably more sensible to use the base64 encoding rather
 *    than the quoted-printable encoding.
 *
 *    WARNING TO IMPLEMENTORS: If binary data are encoded in quoted-
 *    printable, care must be taken to encode CR and LF characters as
 *    "=0D" and "=0A", respectively. In particular, a CRLF sequence in
 *    binary data should be encoded as "=0D=0A". Otherwise, if CRLF
 *    were represented as a hard line break, it might be incorrectly
 *    decoded on platforms with different line break conventions.
 *
 *    For formalists, the syntax of quoted-printable data is described
 *    by the following grammar:
 *
 *    quoted-printable := ([*(ptext / SPACE / TAB) ptext] ["="] CRLF)
 *          ; Maximum line length of 76 characters excluding CRLF
 *
 *    ptext := octet /<any ASCII character except "=", SPACE, or TAB>
 *          ; characters not listed as "mail-safe" in Appendix B
 *          ; are also not recommended.
 *
 *    octet := "=" 2(DIGIT / "A" / "B" / "C" / "D" / "E" / "F")
 *          ; octet must be used for characters > 127, =, SPACE, or TAB,
 *          ; and is recommended for any characters not listed in
 *          ; Appendix B as "mail-safe".
 */

#include <ac/string.h>

#include <input/quoted_print.h>
#include <input/private.h>
#include <mem.h>


/*
 * Per-instance state of a quoted-printable decoding input stream.
 *
 * NOTE: the type is called input_base64_ty although this file decodes
 * quoted-printable text; every function in this file refers to it by
 * that name, so the name is retained.
 */
typedef struct input_base64_ty
{
    /* Base part.  The functions in this file cast between input_ty *
     * and input_base64_ty *, which relies on this being first. */
    input_ty    inherited;

    /* The stream the encoded text is read from. */
    input_ty    *deeper;

    /* Non-zero: the destructor also deletes `deeper'. */
    int         close_on_close;

    /* Set once `deeper' has reported end of input to the read method. */
    int         eof;

    /* Running total of decoded bytes returned by the read method. */
    long        pos;
} input_base64_ty;


/*
 * Destructor slot of the quoted-printable input stream.
 *
 * If the stream was created with the close-on-close flag set, the
 * underlying stream is deleted as well.  In every case the reference
 * to the underlying stream is cleared afterwards.
 */
static void
input_quoted_printable_destructor(input_ty *fp)
{
    input_base64_ty *self = (input_base64_ty *)fp;

    if (self->close_on_close != 0)
        input_delete(self->deeper);

    /* Paranoia: do not leave a stale pointer behind. */
    self->deeper = 0;
}


/*
 * Translate a single hexadecimal digit character into its value.
 *
 * Both upper case ('A'-'F') and lower case ('a'-'f') letters are
 * accepted, as well as the decimal digits.
 *
 * Returns 0..15 for a valid digit, or -1 for any other value
 * (which includes a negative end-of-file indication).
 */
static int
hex(int c)
{
    if (c >= '0' && c <= '9')
        return (c - '0');
    if (c >= 'A' && c <= 'F')
        return (c - 'A' + 10);
    if (c >= 'a' && c <= 'f')
        return (c - 'a' + 10);
    return -1;
}


/*
 * Read method of the quoted-printable input stream.
 *
 * Pulls characters from the underlying stream, decodes them, and
 * stores up to `len' decoded bytes in `data'.
 *
 *   - A run of spaces and tabs directly followed by a newline is
 *     dropped; only the newline is produced.
 *   - A run of spaces and tabs followed by anything else (or by end
 *     of input) is copied through unchanged.
 *   - "=" followed by two hex digits (either case) yields that byte.
 *   - "=" followed by optional spaces/tabs and then a newline is a
 *     soft line break and yields nothing.
 *   - Every other character is copied through literally.
 *
 * Returns the number of bytes stored, which is also added to the
 * stream position.  Returns 0 immediately once the end-of-input flag
 * has been set by an earlier call.
 *
 * Malformed escapes are reported through input_fatal_error(), which
 * presumably does not return (see the NOTREACHED markers).
 *
 * NOTE(review): carriage returns get no special treatment here; a
 * '\r' takes the literal-character path like any other byte.
 */
static long
input_quoted_printable_read(input_ty *fp, void *data, size_t len)
{
    input_base64_ty *this;
    unsigned char   *cp;
    unsigned char   *end;
    long        nbytes;

    this = (input_base64_ty *)fp;
    if (this->eof)
      return 0;
    cp = data;
    end = cp + len;
    while (cp < end)
    {
      int         c;
      int         n1;
      int         n2;

      c = input_getc(this->deeper);
      if (c < 0)
      {
          /* End of input: remember it so later calls return 0 at once. */
          this->eof = 1;
          break;
      }
      if (c == ' ' || c == '\t')
      {
          /*
           * Scratch buffer for the current run of white space.
           * It is static, so it is shared by every call (and every
           * stream of this kind), it only ever grows, and nothing
           * in this function releases it.
           */
          static char       *buffer;
          static size_t   bufmax;
          size_t      bufpos;
          size_t      nchars;

          /*
           * We are supposed to suppress white space on
           * the ends of lines.  This is because some
           * (non-unix, non-windows) mail transfer agents
           * add extra white space on the ends of lines.
           * (Our corresponding encoding escapes trailing
           * spaces and tabs.)
           */
          bufpos = 0;
          for (;;)
          {
            /*
             * Stash this character (we may need it later)
             */
            if (bufpos >= bufmax)
            {
                bufmax = bufmax * 2 + 8;
                buffer = mem_change_size(buffer, bufmax);
            }
            buffer[bufpos++] = c;

            /*
             * See what comes next.
             */
            c = input_getc(this->deeper);
            if (c < 0)
                break;
            if (c == '\n')
            {
                /*
                 * The run was trailing white space: discard
                 * it and emit just the newline.  There is
                 * room for it, because nothing has been
                 * stored since the loop condition was tested.
                 */
                *cp++ = '\n';
                goto next_char;
            }
            if (c != ' ' && c != '\t')
            {
                /*
                 * The run is followed by real text, so it is
                 * significant.  Give the text character back
                 * for the main loop to process.
                 */
                input_ungetc(this->deeper, c);
                break;
            }
          }

          /*
           * Put as many of the buffered characters into
           * the output as possible.  This means we won't
           * double handle them (actually, we would O(n**2)
           * handle them).
           */
          nchars = end - cp;
          if (nchars > bufpos)
            nchars = bufpos;
          memcpy(cp, buffer, nchars);
          cp += nchars;

          /*
           * If there wasn't room, there is no help for it.
           * We will have to give the rest of the buffered
           * characters back.  Hopefully next time will
           * be big enough for all of them.  The
           * pathological case required >16KB of spaces and
           * tabs: unlikely.
           *
           * They are pushed back last-to-first, so that they
           * are read again in their original order (and ahead
           * of any text character pushed back above).
           */
          while (bufpos > nchars)
          {
            --bufpos;
            input_ungetc(this->deeper, buffer[bufpos]);
          }

          /*
           * Don't fall into the next statement, but start
           * this loop from the top.  (We could have run
           * out of output buffer).
           */
          continue;
      }

      /*
       * If this isn't an escape sequence, return the literal
       * character.
       */
      if (c != '=')
      {
          *cp++ = c;
          continue;
      }

      /*
       * Grab two hex digits.  If they aren't hex digits,
       * it is a format error.
       *
       * Except for trailing white space; that we ignore.
       */
      c = input_getc(this->deeper);
      /*
       * NOTE(review): end of input right after '=' stops the loop
       * quietly and, unlike the check at the top of the loop, does
       * not set this->eof.
       */
      if (c < 0)
          break;
      if (c == ' ' || c == '\t')
      {
          /*
           * "=" followed by white space: only more white space
           * and then a newline are acceptable.  Anything else,
           * including end of input, is reported as an error.
           */
          for (;;)
          {
            c = input_getc(this->deeper);
            if (c == '\n')
                break;
            if (c != ' ' && c != '\t')
            {
                input_fatal_error
                (
                  this->deeper,
                  "quoted printable: invalid character"
                );
                /* NOTREACHED */
            }
          }
      }
      /* "=" at the end of a line is a soft line break: emit nothing. */
      if (c == '\n')
          continue;
      n1 = hex(c);
      if (n1 < 0)
      {
          input_fatal_error(fp, "quoted printable: invalid hex character");
          /* NOTREACHED */
      }
      /* End of input here makes hex() return -1, hence an error. */
      c = input_getc(this->deeper);
      n2 = hex(c);
      if (n2 < 0)
      {
          input_fatal_error(fp, "quoted printable: invalid hex character");
          /* NOTREACHED */
      }
      /* First digit is the high nibble, second digit the low nibble. */
      *cp++ = ((n1 << 4) | n2);
      next_char:
      ;
    }
    nbytes = (cp - (unsigned char *)data);
    this->pos += nbytes;
    return nbytes;
}


/*
 * Position slot of the quoted-printable input stream.
 *
 * Returns the stream's `pos' field, i.e. the total number of decoded
 * bytes the read method has returned so far.
 *
 * Note that the parameter, in spite of being called `deeper', is this
 * stream object itself and not the underlying stream.
 */
static long
input_quoted_printable_ftell(input_ty *deeper)
{
    return ((input_base64_ty *)deeper)->pos;
}


/*
 * Name slot of the quoted-printable input stream.
 *
 * This stream has no name of its own; it answers with whatever
 * input_name() reports for the underlying stream.
 */
static struct string_ty *
input_quoted_printable_name(input_ty *fp)
{
    return input_name(((input_base64_ty *)fp)->deeper);
}


/*
 * Length slot of the quoted-printable input stream.
 *
 * Always answers -1, whatever the stream.
 * NOTE(review): presumably -1 is the input layer's "length unknown"
 * value -- confirm against the input_ty interface.
 */
static long
input_quoted_printable_length(input_ty *fp)
{
    (void)fp;   /* the answer does not depend on the stream */
    return -1L;
}


/*
 * Method table for quoted-printable input streams; it is passed to
 * input_new() by input_quoted_printable().
 *
 * The initializer is positional, so the order of the entries has to
 * match the member order of input_vtbl_ty, which is declared outside
 * this file (presumably in input/private.h -- confirm before
 * reordering or adding entries).
 */
static input_vtbl_ty vtbl =
{
    sizeof(input_base64_ty),            /* size of one instance */
    input_quoted_printable_destructor,  /* destructor */
    input_quoted_printable_read,        /* read */
    input_quoted_printable_ftell,       /* ftell */
    input_quoted_printable_name,        /* name */
    input_quoted_printable_length,      /* length */
};


/*
 * Create an input stream which decodes quoted-printable text.
 *
 * deeper   the stream supplying the encoded text
 * coc      "close on close": when non-zero, `deeper' is deleted
 *          along with the new stream
 *
 * Returns the new stream, obtained from input_new(), positioned at
 * offset zero with its end-of-input flag clear.
 */
input_ty *
input_quoted_printable(input_ty *deeper, int coc)
{
    input_base64_ty *self;

    self = (input_base64_ty *)input_new(&vtbl);
    self->pos = 0;
    self->eof = 0;
    self->close_on_close = coc;
    self->deeper = deeper;
    return (input_ty *)self;
}

Generated by  Doxygen 1.6.0   Back to index