linguist/samples/Ragel/simple_tokenizer.rl

=begin
%%{
  machine simple_tokenizer;

  action MyTs {
    my_ts = p
  }
  action MyTe {
    my_te = p
  }
  action Emit {
    emit data[my_ts...my_te].pack('c*')
    my_ts = nil
    my_te = nil
  }

  foo = 'STARTFOO' any+ >MyTs :>> 'ENDFOO' >MyTe %Emit;
  main := ( foo | any+ )*;

}%%
=end

# Scans a file for "STARTFOO[...]ENDFOO" blocks and outputs their contents.
#
# ENV['CHUNK_SIZE'] determines how much of the file to read in at a time, allowing you to control memory usage.
#
# Does not use ragel's scanner functionality because no backtracking is needed.
class SimpleTokenizer
  attr_reader :path

  def initialize(path)
    @path = path
    %% write data;
    # % (this fixes syntax highlighting)
  end

  def emit(foo)
    $stdout.puts foo
  end

  def perform
    # So that ragel doesn't try to get it from data.length
    pe = :ignored
    eof = :ignored

    %% write init;
    # % (this fixes syntax highlighting)

    leftover = []
    my_ts = nil
    my_te = nil

    File.open(path) do |f|
      while chunk = f.read(ENV['CHUNK_SIZE'].to_i)
        data = leftover + chunk.unpack('c*')
        p = 0
        pe = data.length
        %% write exec;
        # % (this fixes syntax highlighting)
        if my_ts
          leftover = data[my_ts..-1]
          my_te = my_te - my_ts if my_te
          my_ts = 0
        else
          leftover = []
        end
      end
    end
  end
end

s = SimpleTokenizer.new ARGV[0]
s.perform