(***********************************************************************
 *
 * SimpleTokenizer.pas version 1.0
 *
 * Copyright (c) 2013 Michel Deslierres
 * All rights reserved. Tous droits rservs.
 *
 * Read full license in the file license.txt
 * Lire la licence au complet dans le fichier licence.txt
 *
 ************************************************************************)

{: Unit that defines a tokenizer object that is used by the parsers
   to get individual parts of a mathematical expressions.}
unit SimpleTokenizer;

{$IFNDEF PARSER_INC}
  {$I Parser.inc}
{$ENDIF}

interface

uses
  SysUtils;

type
  TMdTokenType = (

    {: Value returned by NextToken or NextTrueToken when a token starting
       with an unknown character is found (unknown initial characters are
       any characters that are not a space (ASCII code 1..32), not a digit
       not an operator and not ASCII #0}
    ttUnknown,

    {: Value returned by NextToken or NextTrueToken when the end of the
       source has been reached. In that case TokenLength is 0.}
    ttEOS,

    {: Value returned by NextToken when a sequence of one or more
       consecutive white spaces has been found. This type is never
       returned by NextTrueToken.}
    ttWhite,

    {: Value returned by NextToken or NextTrueToken when a number has
       been found. A number has the following form
         1. - sequence of one or more digits
           1.1 - [optional] decimal symbol (see property DecimalChar)
             1.1.1 - [optional] sequence of digits
         2. [optional] "E" or "e"
           2.1 - [optional] "+" or "-"
           2.2 - sequence of one or more digits.}
    ttNumber,

    {: Value returned by NextToken or NextTrueToken when the operator
        "-" has been found.}
    ttMinus,

    {: Value returned by NextToken or NextTrueToken when the operator
       "+" has been found.}
    ttPlus,

    {: Value returned by NextToken or NextTrueToken when the operator
       "*" has been found.}
    ttMult,

    {: Value returned by NextToken or NextTrueToken when the operator
       "/" has been found.}
    ttDiv,

    {: Value returned by NextToken or NextTrueToken when the operator
       "(" has been found.}
    ttLeftParenthesis,

    {: Value returned by NextToken or NextTrueToken when the operator
       ")" has been found.}
    ttRightParenthesis
    );

type
    {: Basic single line tokenizer. This object scans a source (usually a
       single line) and breaks is up into tokens, sub strings of TMdTokenType.
       Typical usage would be
         1. create tokenizer.
         2. set source.
         3. repeatedly call NextToken (or NextTrueToken) until the end of the
            source is found handling each type of token.
         4. Free the tokenizer.
    }
  TMdTokenizer = class
  private
    FSource: string;
    FDecimalChar: char;
    FTokenType: TMdTokenType;
    FTokenStart: integer;   // Index of the current token's position
    FTokenLength: integer;  // The length of the current token.
    FNextCharPos: integer;  // Index of the character in source (after current token)
    function GetToken: string;
    procedure SetSource(const value: string);
  protected

      {: Called when parsing a token to increment FNextCharPos and
         FTokenLength by one. }
    procedure MoveByOneChar;

      {: Restarts scanning at the beginning of the source. Used in testing
         but there is probably no reason to call this procedure directly.}
    procedure Reset;

  public
    constructor create;

      {: Returns next token's type. Properties TokenStart, TokenLength,
         TokenType and Token are all adjusted. The protected variable
         FNextCharPos is updated to point to the first character in the
         source following the current token. The next call to NextToken
         will start from FNextCharPos.

         If all the source has been scanned then NextToken returns ttEOS
         and
           TokenStart = length(Source) + 1
           TokenLength = 0
           TokenType = ttEOS
           Token = ''. }
    function NextToken: TMdTokenType; virtual;

      {: Returns the next token that is not white space.}
    function NextTrueToken: TMdTokenType;

      {: Character that separates decimals from integers in floating values.
         This is set to the system's locale value when the object is
         created.}
    property DecimalChar: char read FDecimalChar write FDecimalChar;

      {: String containing the source to be scanned. Whenever the source
         is set, the Reset method is called.}
    property Source: string read FSource write SetSource;

      {: Index into Source of the first character of the current token.
         Equal to length(Source) + 1 if at end of source.}
    property TokenStart: integer read FTokenStart;

      {: Length of the current token, 0 if at end of source.}
    property TokenLength: integer read FTokenLength;

      {: The current token as a string.}
    property Token: string read GetToken;

      {: The type of the current token.}
    property TokenType: TMdTokenType read FTokenType;
  end;

implementation

{ Utility functions }

function isWhite(c: char): boolean;
begin
  result := c <= ' ';
end;

function isDigit(c: char): boolean;
begin
  result := (c >= '0') and (c <= '9');
end;

{ TMdTokenizer }

constructor TMdTokenizer.create;
begin
  inherited Create;
  FDecimalChar := DecimalSeparator;
  Reset;
end;

function TMdTokenizer.GetToken: string;
begin
  result := copy(FSource, FTokenStart, FTokenLength)
end;

procedure TMdTokenizer.MoveByOneChar;
begin
  inc(FNextCharPos);
  inc(FTokenLength);
end;

function TMdTokenizer.NextTrueToken: TMdTokenType;
begin
  repeat
    result := NextToken;
  until result <> ttWhite;
end;

function TMdTokenizer.NextToken: TMdTokenType;

  {There are three basic types of tokens: white space, numbers and operators.
   Hence one of the following three procedures is called on the basis of the
   character as FNextCharPos. Each procedure attempts to accumulate as many
   characters in the token as possible given it's type.}

  {Any character equal to or less than as space ' ' is considered a
   white character. See isWhite.}
  procedure GetWhite;
  begin
    result := ttWhite;
    while (FNextCharPos <= length(FSource)) and isWhite(FSource[FNextCharPos]) do
      MoveByOneChar;
  end;

  {No distinction is made between integers and floating numbers. It would
   be easy to replace ttNumber with ttInteger and ttFloat and to differentiate
   between the two while parsing the token as indicated below in comments
   beginning with a single star.

   Another approach could be to keep ttNumber and to add a boolean called
   isFloat for example. This is indicated with two starts}
  procedure GetNumber;
  begin
    result := ttNumber; {* := ttInteger as default} {** isFloat := false}
    {parse leading digits}
    while (FNextCharPos <= length(FSource)) and isDigit(FSource[FNextCharPos]) do
      MoveByOneChar;
    {check if decimal point}
    if (FNextCharPos <= length(FSource)) and (FSource[FNextCharPos] = DecimalChar) then
      MoveByOneChar; {* result := ttFloat} {** isFloat := true}
    {parse decimal part if present}
    while (FNextCharPos <= length(FSource)) and isDigit(FSource[FNextCharPos]) do
      MoveByOneChar;
    {parse exponent if present}
    if (FNextCharPos <= length(FSource)) and (upcase(FSource[FNextCharPos]) = 'E') then begin
      MoveByOneChar;
      {parse exponents + or - if present}
      {$IFDEF UNICODE}
      if (FNextCharPos <= length(FSource)) and (CharInSet(FSource[FNextCharPos], ['-','+'])) then
      {$ELSE}
      if (FNextCharPos <= length(FSource)) and (FSource[FNextCharPos] in ['-','+']) then
      {$ENDIF}
        MoveByOneChar;
      {could enforce need for a digit following E

      if FNextCharPos > length(FSource) or not isDigit(FSource[FNextCharPos] do
        RaiseParseException(peInvalidNumber)

      but that would require moving  EMathParserError and RaiseParseException
      to this unit or to ParserTypes.pas}

      {parse exponent if present}
      while (FNextCharPos <= length(FSource)) and isDigit(FSource[FNextCharPos]) do
        MoveByOneChar;
    end;
  end;

  procedure GetOperator(c: char);
  begin
    MoveByOneChar; {skip c}
    case c of
      '(': result := ttLeftParenthesis;
      ')': result := ttRightParenthesis;
      '*': result := ttMult;
      '+': result := ttPlus;
      '-': result := ttMinus;
      '/': result := ttDiv;
    else
      result := ttUnknown;
    end;
  end;

var
  c: char;
begin
  // move past current token
  FTokenStart := FNextCharPos;
  FTokenLength := 0;
  if FNextCharPos > length(FSource) then
    result := ttEOS
  else begin
    c := FSource[FTokenStart];
    if isWhite(c) then
      GetWhite
    else if isDigit(c) then
      GetNumber
    else
      GetOperator(c);
  end;
  FTokenType := result;
end;

procedure TMdTokenizer.Reset;
begin
  FTokenStart := 1;
  FTokenLength := 0;
  FNextCharPos := 1;
  FTokenType := ttUnknown;
end;

procedure TMdTokenizer.SetSource(const value: string);
begin
  FSource := value;
  Reset;
end;

end.
