Linguist 2.3.2

When testing if a blob is safe to colorize, check size first
Similar to e415a13
2025-10-29 17:50:22 +00:00 · 2012-09-02 00:08:37 -07:00 · 2012-09-02 00:08:37 -07:00 · 2012-08-31 22:47:19 -07:00 · 2012-08-28 10:55:11 -07:00 · 2012-08-28 18:01:46 +01:00
12 changed files with 12094 additions and 5050 deletions
--- a/github-linguist.gemspec
+++ b/github-linguist.gemspec
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
  s.name    = 'github-linguist'
-  s.version = '2.3.0'
+  s.version = '2.3.2'
  s.summary = "GitHub Language detection"

  s.authors = "GitHub"
@@ -12,6 +12,7 @@ Gem::Specification.new do |s|
  s.add_dependency 'escape_utils',    '~> 0.2.3'
  s.add_dependency 'mime-types',      '~> 1.19'
  s.add_dependency 'pygments.rb',     '>= 0.2.13'
+  s.add_development_dependency 'mocha'
  s.add_development_dependency 'json'
  s.add_development_dependency 'rake'
  s.add_development_dependency 'yajl-ruby'
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -160,7 +160,7 @@ module Linguist
    #
    # Return true or false
    def safe_to_colorize?
-      text? && !large? && !high_ratio_of_long_lines?
+      !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a ratio of long lines?
@@ -250,7 +250,9 @@ module Linguist
    #
    # Return true or false
    def indexable?
-      if binary?
+      if size > 100 * 1024
+        false
+      elsif binary?
        false
      elsif extname == '.txt'
        true
@@ -260,8 +262,6 @@ module Linguist
        false
      elsif generated?
        false
-      elsif size > 100 * 1024
-        false
      else
        true
      end
--- a/lib/linguist/samples.json
+++ b/lib/linguist/samples.json
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -16,6 +16,9 @@ module Linguist
      new.extract_tokens(data)
    end

+    # Read up to 100KB
+    BYTE_LIMIT = 100_000
+
    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
@@ -55,6 +58,8 @@ module Linguist

      tokens = []
      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+
        if token = s.scan(/^#!.+$/)
          if name = extract_shebang(token)
            tokens << "SHEBANG#!#{name}"
--- a/samples/C++/gdsdbreader.h
+++ b/samples/C++/gdsdbreader.h
@@ -0,0 +1,69 @@
+#ifndef GDSDBREADER_H
+#define GDSDBREADER_H
+
+// This file contains core structures, classes and types for the entire gds app
+// WARNING: DO NOT MODIFY UNLESS IT'S STRICTLY NECESSARY
+
+#include <QDir>
+#include "diagramwidget/qgldiagramwidget.h"
+
+#define GDS_DIR "gdsdata"
+
+enum level {LEVEL_ONE, LEVEL_TWO, LEVEL_THREE};
+
+// The internal structure of the db to store information about each node (each level)
+// this will be serialized before being written to file
+class dbDataStructure
+{
+public:
+    QString label;
+    quint32 depth;
+    quint32 userIndex;
+    QByteArray data;    // This is COMPRESSED data, optimize ram and disk space, is decompressed
+                        // just when needed (to display the comments)
+
+    // The following ID is used to create second-third level files
+    quint64 uniqueID;
+    // All the next items linked to this one
+    QVector<dbDataStructure*> nextItems;
+    // Corresponding indices vector (used to store data)
+    QVector<quint32> nextItemsIndices;
+    // The father element (or NULL if it's root)
+    dbDataStructure* father;
+    // Corresponding indices vector (used to store data)
+    quint32 fatherIndex;
+    bool noFatherRoot; // Used to tell if this node is the root (so hasn't a father)
+
+    // These fields will be useful for levels 2 and 3
+    QString fileName; // Relative filename for the associated code file
+    QByteArray firstLineData; // Compressed first line data, this will be used with the line number to retrieve info
+    QVector<quint32> linesNumbers; // First and next lines (next are relative to the first) numbers
+
+    // -- Generic system data not to be stored on disk
+    void *glPointer; // GL pointer
+
+    // These operator overrides prevent the glPointer and other non-disk-necessary data serialization
+    friend QDataStream& operator<<(QDataStream& stream, const dbDataStructure& myclass)
+    // Notice: this function has to be "friend" because it cannot be a member function, member functions
+    // have an additional parameter "this" which isn't in the argument list of an operator overload. A friend
+    // function has full access to private data of the class without having the "this" argument
+    {
+        // Don't write glPointer and every pointer-dependent structure
+        return stream << myclass.label << myclass.depth << myclass.userIndex << qCompress(myclass.data)
+                         << myclass.uniqueID << myclass.nextItemsIndices << myclass.fatherIndex << myclass.noFatherRoot
+                            << myclass.fileName << qCompress(myclass.firstLineData) << myclass.linesNumbers;
+    }
+    friend QDataStream& operator>>(QDataStream& stream, dbDataStructure& myclass)
+    {
+        //Don't read it, either
+        stream >> myclass.label >> myclass.depth >> myclass.userIndex >> myclass.data
+                      >> myclass.uniqueID >> myclass.nextItemsIndices >> myclass.fatherIndex >> myclass.noFatherRoot
+                         >> myclass.fileName >> myclass.firstLineData >> myclass.linesNumbers;
+        myclass.data = qUncompress(myclass.data);
+        myclass.firstLineData = qUncompress(myclass.firstLineData);
+        return stream;
+    }
+
+};
+
+#endif // GDSDBREADER_H
--- a/samples/C/rf_io.c
+++ b/samples/C/rf_io.c
--- a/samples/C/rf_io.h
+++ b/samples/C/rf_io.h
@@ -0,0 +1,682 @@
+/**
+** Copyright (c) 2011-2012, Karapetsas Eleftherios
+** All rights reserved.
+**
+** Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+**  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+**  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
+**     the documentation and/or other materials provided with the distribution.
+**  3. Neither the name of the Original Author of Refu nor the names of its contributors may be used to endorse or promote products derived from
+**
+**  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+**  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+**  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+**  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+**  SERVICES;LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+**  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+**  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**/
+
+
+#ifndef REFU_IO_H
+#define REFU_IO_H
+
+#include <rf_setup.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C"
+{// opening bracket for calling from C++
+#endif
+
+// New line feed
+#define RF_LF   0xA
+// Carriage Return
+#define RF_CR   0xD
+
+#ifdef REFU_WIN32_VERSION
+    #define i_PLUSB_WIN32   "b"
+#else
+    #define i_PLUSB_WIN32   ""
+#endif
+
+// This is the type that represents the file offset
+#ifdef _MSC_VER
+typedef __int64 foff_rft;
+#else
+#include <sys/types.h>
+typedef off64_t foff_rft;
+#endif
+///Fseek and Ftell definitions
+#ifdef _MSC_VER
+    #define rfFseek(i_FILE_,i_OFFSET_,i_WHENCE_)    _fseeki64(i_FILE_,i_OFFSET_,i_WHENCE_)
+    #define rfFtell(i_FILE_)                        _ftelli64(i_FILE_)
+#else
+    #define rfFseek(i_FILE_,i_OFFSET_,i_WHENCE_)    fseeko64(i_FILE_,i_OFFSET_,i_WHENCE_)
+    #define rfFtell(i_FILE_)                        ftello64(i_FILE_)
+#endif
+
+/**
+** @defgroup RF_IOGRP I/O
+** @addtogroup RF_IOGRP
+** @{
+**/
+
+// @brief Reads a UTF-8 file descriptor until end of line or EOF is found and returns a UTF-8 byte buffer
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// When the compile flag @c RF_NEWLINE_CRLF is defined (the default case at Windows) then this function
+// shall not be adding any CR character that is found in the file behind a newline character since this is
+// the Windows line ending scheme. Beware though that the returned  read bytes value shall still count the CR character inside.
+//
+// @param[in] f The file descriptor to read
+// @param[out] utf8 Give here a reference to an uninitialized char* that will be allocated inside the function
+// and contain the utf8 byte buffer. Needs to be freed by the caller explicitly later
+// @param[out] byteLength Give an @c uint32_t here to receive the length of the @c utf8 buffer in bytes
+// @param[out] bufferSize Give an @c uint32_t here to receive the capacity of the @c utf8 buffer in bytes
+// @param[out] eof Pass a pointer to a char to receive a true or false value indicating whether the end of file
+// was reached while reading this line
+// @return Returns either a positive number for success that represents the number of bytes read from @c f or an error in case something goes wrong.
+// The possible errors to return are the same as rfFgets_UTF8()
+i_DECLIMEX_ int32_t rfFReadLine_UTF8(FILE* f,char** utf8,uint32_t* byteLength,uint32_t* bufferSize,char* eof);
+// @brief Reads a Big Endian UTF-16 file descriptor until end of line or EOF is found and returns a UTF-8 byte buffer
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// When the compile flag @c RF_NEWLINE_CRLF is defined (the default case at Windows) then this function
+// shall not be adding any CR character that is found in the file behind a newline character since this is
+// the Windows line ending scheme. Beware though that the returned  read bytes value shall still count the CR character inside.
+//
+// @param[in] f The file descriptor to read
+// @param[out] utf8 Give here a reference to an uninitialized char* that will be allocated inside the function
+// and contain the utf8 byte buffer. Needs to be freed by the caller explicitly later
+// @param[out] byteLength Give an @c uint32_t here to receive the length of the @c utf8 buffer in bytes
+// @param[out] eof Pass a pointer to a char to receive a true or false value indicating whether the end of file
+// was reached while reading this line
+// @return Returns either a positive number for success that represents the number of bytes read from @c f or an error in case something goes wrong.
+// + Any error that can be returned by @ref rfFgets_UTF16BE()
+// + @c RE_UTF16_INVALID_SEQUENCE: Failed to decode the UTF-16 byte stream of the file descriptor
+// + @c RE_UTF8_ENCODING: Failed to encode the UTF-16 of the file descriptor into UTF-8
+i_DECLIMEX_ int32_t rfFReadLine_UTF16BE(FILE* f,char** utf8,uint32_t* byteLength,char* eof);
+// @brief Reads a Little Endian UTF-16 file descriptor until end of line or EOF is found and returns a UTF-8 byte buffer
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// When the compile flag @c RF_NEWLINE_CRLF is defined (the default case at Windows) then this function
+// shall not be adding any CR character that is found in the file behind a newline character since this is
+// the Windows line ending scheme. Beware though that the returned read bytes value shall still count the CR character inside.
+//
+// @param[in] f The file descriptor to read
+// @param[out] utf8 Give here a reference to an uninitialized char* that will be allocated inside the function
+// and contain the utf8 byte buffer. Needs to be freed by the caller explicitly later
+// @param[out] byteLength Give an @c uint32_t here to receive the length of the @c utf8 buffer in bytes
+// @param[out] eof Pass a pointer to a char to receive a true or false value indicating whether the end of file
+// was reached while reading this line
+// @return Returns either a positive number for success that represents the number of bytes read from @c f or an error in case something goes wrong.
+// + Any error that can be returned by @ref rfFgets_UTF16LE()
+// + @c RE_UTF16_INVALID_SEQUENCE: Failed to decode the UTF-16 byte stream of the file descriptor
+// + @c RE_UTF8_ENCODING: Failed to encode the UTF-16 of the file descriptor into UTF-8
+i_DECLIMEX_ int32_t rfFReadLine_UTF16LE(FILE* f,char** utf8,uint32_t* byteLength,char* eof);
+
+// @brief Reads a Big Endian UTF-32 file descriptor until end of line or EOF is found and returns a UTF-8 byte buffer
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// When the compile flag @c RF_NEWLINE_CRLF is defined (the default case at Windows) then this function
+// shall not be adding any CR character that is found in the file behind a newline character since this is
+// the Windows line ending scheme. Beware though that the returned read bytes value shall still count the CR character inside.
+//
+// @param[in] f The file descriptor to read
+// @param[out] utf8 Give here a reference to an uninitialized char* that will be allocated inside the function
+// and contain the utf8 byte buffer. Needs to be freed by the caller explicitly later
+// @param[out] byteLength Give an @c uint32_t here to receive the length of the @c utf8 buffer in bytes
+// @param[out] eof Pass a pointer to a char to receive a true or false value indicating whether the end of file
+// was reached while reading this line
+// @return Returns either a positive number for success that represents the number of bytes read from @c f or an error in case something goes wrong.
+// + Any error that can be returned by @ref rfFgets_UTF32BE()
+// + @c RE_UTF8_ENCODING: Failed to encode the UTF-32 of the file descriptor into UTF-8
+i_DECLIMEX_ int32_t rfFReadLine_UTF32BE(FILE* f,char** utf8,uint32_t* byteLength,char* eof);
+// @brief Reads a Little Endian UTF-32 file descriptor until end of line or EOF is found and returns a UTF-8 byte buffer
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// When the compile flag @c RF_NEWLINE_CRLF is defined (the default case at Windows) then this function
+// shall not be adding any CR character that is found in the file behind a newline character since this is
+// the Windows line ending scheme. Beware though that the returned read bytes value shall still count the CR character inside.
+//
+// @param[in] f The file descriptor to read
+// @param[out] utf8 Give here a reference to an uninitialized char* that will be allocated inside the function
+// and contain the utf8 byte buffer. Needs to be freed by the caller explicitly later
+// @param[out] byteLength Give an @c uint32_t here to receive the length of the @c utf8 buffer in bytes
+// @param[out] eof Pass a pointer to a char to receive a true or false value indicating whether the end of file
+// was reached while reading this line
+// @return Returns either a positive number for success that represents the number of bytes read from @c f or an error in case something goes wrong.
+// + Any error that can be returned by @ref rfFgets_UTF32LE()
+// + @c RE_UTF8_ENCODING: Failed to encode the UTF-32 of the file descriptor into UTF-8
+i_DECLIMEX_ int32_t rfFReadLine_UTF32LE(FILE* f,char** utf8,uint32_t* byteLength,char* eof);
+
+// @brief Gets a number of bytes from a BIG endian UTF-32 file descriptor
+//
+// This is a function that's similar to c library fgets but it also returns the number of bytes read. Reads in from the file until @c num bytes
+// have been read or new line or EOF character has been encountered.
+//
+// The function will read until @c num characters are read and if @c num
+// would take us to the middle of a UTF32 character then the next character shall also be read
+// and the function will return the number of bytes read.
+// Since the function null terminates the buffer the given @c buff needs to be of at least
+// @c num+7 size to cater for the worst case.
+//
+// The final bytestream stored inside @c buff is in the endianness of the system.
+//
+// If right after the last character read comes the EOF, the function
+// shall detect so and assign @c true to @c eof.
+//
+// In Windows where file endings are in the form of 2 bytes CR-LF (Carriage return - NewLine) this function
+// shall just ignore the carriage returns and not return it inside the return buffer at @c buff.
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// @param[in] buff A buffer to be filled with the contents of the file. Should be of size at least @c num+7
+// @param[in] num The maximum number of bytes to read from within the file NOT including the null terminating character (which in itself is 4 bytes). Should be a multiple of 4
+// @param[in] f A valid FILE descriptor from which to read the bytes
+// @param[out] eof Pass a reference to a char to receive a true/false value for whether EOF has been reached.
+// @return Returns the actual number of bytes read or an error if there was a problem.
+// The possible errors are:
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgets_UTF32BE(char* buff,uint32_t num,FILE* f,char* eof);
+// @brief Gets a number of bytes from a Little endian UTF-32 file descriptor
+//
+// This is a function that's similar to c library fgets but it also returns the number of bytes read. Reads in from the file until @c num bytes
+// have been read or new line or EOF character has been encountered.
+//
+// The function will read until @c num characters are read and if @c num
+// would take us to the middle of a UTF32 character then the next character shall also be read
+// and the function will return the number of bytes read.
+// Since the function null terminates the buffer the given @c buff needs to be of at least
+// @c num+7 size to cater for the worst case.
+//
+// The final bytestream stored inside @c buff is in the endianness of the system.
+//
+// If right after the last character read comes the EOF, the function
+// shall detect so and assign @c true to @c eof.
+//
+// In Windows where file endings are in the form of 2 bytes CR-LF (Carriage return - NewLine) this function
+// shall just ignore the carriage returns and not return it inside the return buffer at @c buff.
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// @param[in] buff A buffer to be filled with the contents of the file. Should be of size at least @c num+7
+// @param[in] num The maximum number of bytes to read from within the file NOT including the null terminating character (which in itself is 4 bytes). Should be a multiple of 4
+// @param[in] f A valid FILE descriptor from which to read the bytes
+// @param[out] eof Pass a reference to a char to receive a true/false value for whether EOF has been reached.
+// @return Returns the actual number of bytes read or an error if there was a problem.
+// The possible errors are:
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgets_UTF32LE(char* buff,uint32_t num,FILE* f,char* eof);
+
+// @brief Gets a number of bytes from a BIG endian UTF-16 file descriptor
+//
+// This is a function that's similar to c library fgets but it also returns the number of bytes read. Reads in from the file until @c num bytes
+// have been read or new line or EOF character has been encountered.
+//
+// The function will read until @c num characters are read and if @c num
+// would take us to the middle of a UTF16 character then the next character shall also be read
+// and the function will return the number of bytes read.
+// Since the function null terminates the buffer the given @c buff needs to be of at least
+// @c num+5 size to cater for the worst case.
+//
+// The final bytestream stored inside @c buff is in the endianness of the system.
+//
+// If right after the last character read comes the EOF, the function
+// shall detect so and assign @c true to @c eof.
+//
+// In Windows where file endings are in the form of 2 bytes CR-LF (Carriage return - NewLine) this function
+// shall just ignore the carriage returns and not return it inside the return buffer at @c buff.
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// @param[in] buff A buffer to be filled with the contents of the file. Should be of size at least @c num+5
+// @param[in] num The maximum number of bytes to read from within the file NOT including the null terminating character (which in itself is 2 bytes). Should be a multiple of 2
+// @param[in] f A valid FILE descriptor from which to read the bytes
+// @param[out] eof Pass a reference to a char to receive a true/false value for whether EOF has been reached.
+// @return Returns the actual number of bytes read or an error if there was a problem.
+// The possible errors are:
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgets_UTF16BE(char* buff,uint32_t num,FILE* f,char* eof);
+// @brief Gets a number of bytes from a Little endian UTF-16 file descriptor
+//
+// This is a function that's similar to c library fgets but it also returns the number of bytes read. Reads in from the file until @c num bytes
+// have been read or new line or EOF character has been encountered.
+//
+// The function will read until @c num characters are read and if @c num
+// would take us to the middle of a UTF16 character then the next character shall also be read
+// and the function will return the number of bytes read.
+// Since the function null terminates the buffer the given @c buff needs to be of at least
+// @c num+5 size to cater for the worst case.
+//
+// The final bytestream stored inside @c buff is in the endianness of the system.
+//
+// If right after the last character read comes the EOF, the function
+// shall detect so and assign @c true to @c eof.
+//
+// In Windows where file endings are in the form of 2 bytes CR-LF (Carriage return - NewLine) this function
+// shall just ignore the carriage returns and not return it inside the return buffer at @c buff.
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// @param[in] buff A buffer to be filled with the contents of the file. Should be of size at least @c num+5
+// @param[in] num The maximum number of bytes to read from within the file NOT including the null terminating character (which in itself is 2 bytes). Should be a multiple of 2
+// @param[in] f A valid FILE descriptor from which to read the bytes
+// @param[out] eof Pass a reference to a char to receive a true/false value for whether EOF has been reached.
+// @return Returns the actual number of bytes read or an error if there was a problem.
+// The possible errors are:
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgets_UTF16LE(char* buff,uint32_t num,FILE* f,char* eof);
+// @brief Gets a number of bytes from a UTF-8 file descriptor
+//
+// This is a function that's similar to c library fgets but it also returns the number of bytes read. Reads in from the file until @c num characters
+// have been read or new line or EOF character has been encountered.
+//
+// The function  automatically adds a null termination character at the end of
+// @c buff but this character is not included in the returned actual number of bytes.
+//
+// The function will read until @c num characters are read and if @c num
+// would take us to the middle of a UTF8 character then the next character shall also be read
+// and the function will return the number of bytes read.
+// Since the function null terminates the buffer the given @c buff needs to be of at least
+// @c num+4 size to cater for the worst case.
+//
+// If right after the last character read comes the EOF, the function
+// shall detect so and assign @c true to @c eof.
+//
+// In Windows where file endings are in the form of 2 bytes CR-LF (Carriage return - NewLine) this function
+// shall just ignore the carriage returns and not return it inside the return buffer at @c buff.
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// @param[in] buff A buffer to be filled with the contents of the file. Should be of size at least @c num+4
+// @param[in] num The maximum number of bytes to read from within the file NOT including the null terminating character (which in itself is 1 byte)
+// @param[in] f A valid FILE descriptor from which to read the bytes
+// @param[out] eof Pass a reference to a char to receive a true/false value for whether EOF has been reached.
+// @return Returns the actual number of bytes read or an error if there was a problem.
+// The possible errors are:
+// + @c RE_UTF8_INVALID_SEQUENCE_INVALID_BYTE: If an invalid UTF-8 byte has been found
+// + @c RE_UTF8_INVALID_SEQUENCE_CONBYTE: If during parsing the file we were expecting a continuation
+// byte and did not find it
+// + @c RE_UTF8_INVALID_SEQUENCE_END: If the null character is encountered in between bytes that should
+// have been continuation bytes
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgets_UTF8(char* buff,uint32_t num,FILE* f,char* eof);
+
+// @brief  Gets a unicode character from a UTF-8 file descriptor
+//
+// This function attempts to assume a more modern fgetc() role for UTF-8 encoded files.
+// Reads bytes from the File descriptor @c f until a full UTF-8 unicode character has been read
+//
+// After this function the file pointer will have moved either by @c 1, @c 2, @c 3 or @c 4
+// bytes if the return value is positive. You can see how much by checking the return value.
+//
+// You shall need to provide an integer at @c c to contain either the decoded Unicode
+// codepoint or the UTF-8 endoced byte depending on the value of the @c cp argument.
+//
+// @param f A valid FILE descriptor from which to read the bytes
+// @param c Pass an int that will receive either the unicode code point value or
+// the UTF8 bytes depending on the value of the @c cp flag
+// @param cp A boolean flag. If @c true then the int passed at @c c will contain the unicode code point
+// of the read character, so the UTF-8 will be decoded.
+// If @c false the int passed at @c c will contain the value of the read bytes in UTF-8 without any decoding
+// @return Returns the number of bytes read (either @c 1, @c 2, @c 3 or @c 4) or an error if the function
+// fails for some reason. Possible error values are:
+// + @c RE_FILE_EOF: The end of file has been found while reading. If the end of file is encountered
+// in the middle of a UTF-8 encoded character where we would be expecting something different
+// and @c RE_UTF8_INVALID_SEQUENCE_END error is also logged
+// + @c RE_UTF8_INVALID_SEQUENCE_INVALID_BYTE: If an invalid UTF-8 byte has been found
+// + @c RE_UTF8_INVALID_SEQUENCE_CONBYTE: If during parsing the file we were expecting a continuation
+// byte and did not find it
+// + @c RE_UTF8_INVALID_SEQUENCE_END: If the null character is encountered in between bytes that should
+// have been continuation bytes
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgetc_UTF8(FILE* f,uint32_t *c,char cp);
+// @brief  Gets a unicode character from a UTF-16 Big Endian file descriptor
+//
+// This function attempts to assume a more modern fgetc() role for UTF-16 encoded files.
+// Reads bytes from the File descriptor @c f until a full UTF-16 unicode character has been read
+//
+// After this function the file pointer will have moved either by @c 2 or @c 4
+// bytes if the return value is positive. You can see how much by checking the return value.
+//
+// You shall need to provide an integer at @c c to contain either the decoded Unicode
+// codepoint or the Bigendian encoded UTF-16 bytes depending on the value of @c the cp argument.
+//
+// @param f A valid FILE descriptor from which to read the bytes
+// @param c Pass an int that will receive either the unicode code point value or
+// the UTF16 bytes depending on the value of the @c cp flag
+// @param cp A boolean flag. If @c true then the int passed at @c c will contain the unicode code point
+// of the read character, so the UTF-16 will be decoded.
+// If @c false the int passed at @c c will contain the value of the read bytes in UTF-16 without any decoding
+// @return Returns the number of bytes read (either @c 2 or @c 4) or an error if the function
+// fails for some reason. Possible error values are:
+// + @c RE_UTF16_INVALID_SEQUENCE: Either the read word or its surrogate pair if 4 bytes were read held illegal values
+// + @c RE_UTF16_NO_SURRPAIR: According to the first read word a surrogate pair was expected but none was found
+// + @c RE_FILE_EOF: The end of file has been found while reading. If the end of file is encountered
+// while we expect a UTF-16 surrogate pair an appropriate error is logged
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgetc_UTF16BE(FILE* f,uint32_t *c,char cp);
+// @brief  Gets a unicode character from a UTF-16 Little Endian file descriptor
+//
+// This function attempts to assume a more modern fgetc() role for UTF-16 encoded files.
+// Reads bytes from the File descriptor @c f until a full UTF-16 unicode character has been read
+//
+// After this function the file pointer will have moved either by @c 2 or @c 4
+// bytes if the return value is positive. You can see how much by checking the return value.
+//
+// You shall need to provide an integer at @c c to contain either the decoded Unicode
+// codepoint or the Bigendian encoded UTF-16 bytes depending on the value of @c the cp argument.
+//
+// @param f A valid FILE descriptor from which to read the bytes
+// @param c Pass an int that will receive either the unicode code point value or
+// the UTF16 bytes depending on the value of the @c cp flag
+// @param cp A boolean flag. If @c true then the int passed at @c c will contain the unicode code point
+// of the read character, so the UTF-16 will be decoded.
+// If @c false the int passed at @c c will contain the value of the read bytes in UTF-16 without any decoding
+// @return Returns the number of bytes read (either @c 2 or @c 4) or an error if the function
+// fails for some reason. Possible error values are:
+// + @c RE_UTF16_INVALID_SEQUENCE: Either the read word or its surrogate pair if 4 bytes were read held illegal values
+// + @c RE_UTF16_NO_SURRPAIR: According to the first read word a surrogate pair was expected but none was found
+// + @c RE_FILE_EOF: The end of file has been found while reading. If the end of file is encountered
+// while we expect a UTF-16 surrogate pair an appropriate error is logged
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgetc_UTF16LE(FILE* f,uint32_t *c,char cp);
+// @brief  Gets a unicode character from a UTF-32 Little Endian file descriptor
+//
+// This function attempts to assume a more modern fgetc() role for UTF-32 encoded files.
+// Reads bytes from the File descriptor @c f until a full UTF-32 unicode character has been read
+//
+// After this function the file pointer will have moved by @c 4
+// bytes if the return value is positive.
+//
+// You shall need to provide an integer at @c to contain the UTF-32 codepoint.
+//
+// @param f A valid FILE descriptor from which to read the bytes
+// @param c Pass an int that will receive either the unicode code point value or
+// the UTF16 bytes depending on the value of the @c cp flag
+// If @c false the int passed at @c c will contain the value of the read bytes in UTF-16 without any decoding
+// @return Returns either @c RF_SUCCESS for succesfull readin or one of the following errors:
+// + @c RE_FILE_EOF: The end of file has been found while reading.
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgetc_UTF32LE(FILE* f,uint32_t *c);
+// @brief  Gets a unicode character from a UTF-32 Big Endian file descriptor
+//
+// This function attempts to assume a more modern fgetc() role for UTF-32 encoded files.
+// Reads bytes from the File descriptor @c f until a full UTF-32 unicode character has been read
+//
+// After this function the file pointer will have moved by @c 4
+// bytes if the return value is positive.
+//
+// You shall need to provide an integer at @c to contain the UTF-32 codepoint.
+//
+// @param f A valid FILE descriptor from which to read the bytes
+// @param c Pass an int that will receive either the unicode code point value or
+// the UTF16 bytes depending on the value of the @c cp flag
+// If @c false the int passed at @c c will contain the value of the read bytes in UTF-16 without any decoding
+// @return Returns either @c RF_SUCCESS for succesfull readin or one of the following errors:
+// + @c RE_FILE_EOF: The end of file has been found while reading.
+// + @c RE_FILE_READ: If during reading the file there was an unknown read error
+// + @c RE_FILE_READ_BLOCK: If the read operation failed due to the file descriptor being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the file descriptor's mode was not correctly set for reading
+// + @c RE_FILE_POS_OVERFLOW: If during reading, the current file position can't be represented by the system
+// + @c RE_INTERRUPT: If during reading, there was a system interrupt
+// + @c RE_FILE_IO: If there was a physical I/O error
+// + @c RE_FILE_NOSPACE: If reading failed due to insufficient storage space
+i_DECLIMEX_ int32_t rfFgetc_UTF32BE(FILE* f,uint32_t *c);
+
+// @brief Moves a unicode character backwards in a big endian UTF-32 file stream
+//
+// @param f The file stream
+// @param c Returns the character we moved back to as a unicode codepoint
+// @return Returns either @c RF_SUCCESS for success or one of the following errors:
+// + @c RE_FILE_POS_OVERFLOW: If during trying to read the current file's position it can't be represented by the system
+// + @c RE_FILE_BAD: If The file descriptor is corrupt/illegal
+// + @c RE_FILE_NOTFILE: If the file descriptor is not a file but something else. e.g. socket.
+// + @c RE_FILE_GETFILEPOS: If the file's position could not be retrieved for some unknown reason
+// + @c RE_FILE_WRITE_BLOCK: While attempting to move the file pointer, it was occupied by another thread, and the no block flag was set
+// + @c RE_INTERRUPT: Operating on the file failed due to a system interrupt
+// + @c RE_FILE_IO: There was a physical I/O error
+// + @c RE_FILE_NOSPACE: There was no space on the device holding the file
+// + @c RE_FILE_NOTFILE: The device we attempted to manipulate is non-existent
+// + @c RE_FILE_READ: If during reading the file there was an error
+// + @c RE_FILE_READ_BLOCK: If during reading the file the read operation failed due to the file being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the underlying file descriptor's mode was not correctly set for reading
+i_DECLIMEX_ int32_t rfFback_UTF32BE(FILE* f,uint32_t *c);
+// @brief Moves a unicode character backwards in a little endian UTF-32 file stream
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// @param f The file stream
+// @param c Returns the character we moved back to as a unicode codepoint
+// @return Returns either @c RF_SUCCESS for success or one of the following errors:
+// + @c RE_FILE_POS_OVERFLOW: If during trying to read the current file's position it can't be represented by the system
+// + @c RE_FILE_BAD: If The file descriptor is corrupt/illegal
+// + @c RE_FILE_NOTFILE: If the file descriptor is not a file but something else. e.g. socket.
+// + @c RE_FILE_GETFILEPOS: If the file's position could not be retrieved for some unknown reason
+// + @c RE_FILE_WRITE_BLOCK: While attempting to move the file pointer, it was occupied by another thread, and the no block flag was set
+// + @c RE_INTERRUPT: Operating on the file failed due to a system interrupt
+// + @c RE_FILE_IO: There was a physical I/O error
+// + @c RE_FILE_NOSPACE: There was no space on the device holding the file
+// + @c RE_FILE_NOTFILE: The device we attempted to manipulate is non-existent
+// + @c RE_FILE_READ: If during reading the file there was an error
+// + @c RE_FILE_READ_BLOCK: If during reading the file the read operation failed due to the file being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the underlying file descriptor's mode was not correctly set for reading
+i_DECLIMEX_ int32_t rfFback_UTF32LE(FILE* f,uint32_t *c);
+// @brief Moves a unicode character backwards in a big endian UTF-16 file stream
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// @param f The file stream
+// @param c Returns the character we moved back to as a unicode codepoint
+// @return Returns either the number of bytes moved backwards (either @c 4 or @c 2) for success or one of the following errors:
+// + @c RE_UTF16_INVALID_SEQUENCE: Either the read word or its surrogate pair if 4 bytes were read held illegal values
+// + @c RE_FILE_POS_OVERFLOW: If during trying to read the current file's position it can't be represented by the system
+// + @c RE_FILE_BAD: If The file descriptor is corrupt/illegal
+// + @c RE_FILE_NOTFILE: If the file descriptor is not a file but something else. e.g. socket.
+// + @c RE_FILE_GETFILEPOS: If the file's position could not be retrieved for some unknown reason
+// + @c RE_FILE_WRITE_BLOCK: While attempting to move the file pointer, it was occupied by another thread, and the no block flag was set
+// + @c RE_INTERRUPT: Operating on the file failed due to a system interrupt
+// + @c RE_FILE_IO: There was a physical I/O error
+// + @c RE_FILE_NOSPACE: There was no space on the device holding the file
+// + @c RE_FILE_NOTFILE: The device we attempted to manipulate is non-existent
+// + @c RE_FILE_READ: If during reading the file there was an error
+// + @c RE_FILE_READ_BLOCK: If during reading the file the read operation failed due to the file being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the underlying file descriptor's mode was not correctly set for reading
+i_DECLIMEX_ int32_t rfFback_UTF16BE(FILE* f,uint32_t *c);
+// @brief Moves a unicode character backwards in a little endian UTF-16 file stream
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// @param f The file stream
+// @param c Returns the character we moved back to as a unicode codepoint
+// @return Returns either the number of bytes moved backwards (either @c 4 or @c 2) for success or one of the following errors:
+// + @c RE_UTF16_INVALID_SEQUENCE: Either the read word or its surrogate pair if 4 bytes were read held illegal values
+// + @c RE_FILE_POS_OVERFLOW: If during trying to read the current file's position it can't be represented by the system
+// + @c RE_FILE_BAD: If The file descriptor is corrupt/illegal
+// + @c RE_FILE_NOTFILE: If the file descriptor is not a file but something else. e.g. socket.
+// + @c RE_FILE_GETFILEPOS: If the file's position could not be retrieved for some unknown reason
+// + @c RE_FILE_WRITE_BLOCK: While attempting to move the file pointer, it was occupied by another thread, and the no block flag was set
+// + @c RE_INTERRUPT: Operating on the file failed due to a system interrupt
+// + @c RE_FILE_IO: There was a physical I/O error
+// + @c RE_FILE_NOSPACE: There was no space on the device holding the file
+// + @c RE_FILE_NOTFILE: The device we attempted to manipulate is non-existent
+// + @c RE_FILE_READ: If during reading the file there was an error
+// + @c RE_FILE_READ_BLOCK: If during reading the file the read operation failed due to the file being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the underlying file descriptor's mode was not correctly set for reading
+i_DECLIMEX_ int32_t rfFback_UTF16LE(FILE* f,uint32_t *c);
+// @brief Moves a unicode character backwards in a UTF-8 file stream
+//
+// The file descriptor at @c f must have been opened in <b>binary</b> and not text mode. That means that if under
+// Windows make sure to call fopen with "wb", "rb" e.t.c. instead of the simple "w", "r" e.t.c. since the initial
+// default value under Windows is text mode. Alternatively you can set the initial value using _get_fmode() and
+// _set_fmode(). For more information take a look at the msdn pages here:
+// http://msdn.microsoft.com/en-us/library/ktss1a9b.aspx
+//
+// @param f The file stream
+// @param c Returns the character we moved back to as a unicode codepoint
+// @return Returns either the number of bytes moved backwards for success (either @c 4, @c 3, @c 2 or @c 1) or one of the following errors:
+// + @c RE_UTF8_INVALID_SEQUENCE: If during moving bacwards in the file unexpected UTF-8 bytes were found
+// + @c RE_FILE_POS_OVERFLOW: If during trying to read the current file's position it can't be represented by the system
+// + @c RE_FILE_BAD: If The file descriptor is corrupt/illegal
+// + @c RE_FILE_NOTFILE: If the file descriptor is not a file but something else. e.g. socket.
+// + @c RE_FILE_GETFILEPOS: If the file's position could not be retrieved for some unknown reason
+// + @c RE_FILE_WRITE_BLOCK: While attempting to move the file pointer, it was occupied by another thread, and the no block flag was set
+// + @c RE_INTERRUPT: Operating on the file failed due to a system interrupt
+// + @c RE_FILE_IO: There was a physical I/O error
+// + @c RE_FILE_NOSPACE: There was no space on the device holding the file
+// + @c RE_FILE_NOTFILE: The device we attempted to manipulate is non-existent
+// + @c RE_FILE_READ: If during reading the file there was an error
+// + @c RE_FILE_READ_BLOCK: If during reading the file the read operation failed due to the file being occupied by another thread
+// + @c RE_FILE_MODE: If during reading the file the underlying file descriptor's mode was not correctly set for reading
+i_DECLIMEX_ int32_t rfFback_UTF8(FILE* f,uint32_t *c);
+
+// @brief Opens another process as a pipe
+//
+// This function is a cross-platform popen wrapper. In linux it uses popen and in Windows it uses
+// _popen.
+// @lmsFunction
+// @param command The string with the command to execute. Is basically the name of the program/process you want to spawn
+// with its full path and its parameters. @inhtype{String,StringX} @tmpSTR
+// @param mode The mode you want the pipe to work in. There are two possible values:
+// + @c "r" The calling process can read the spawned command's standard output via the returned stream.
+// + @c "w" The calling process can write to the spawned command's standard input via the returned stream.
+//
+// Anything else will result in an error
+// @return For success popen will return a FILE descriptor that can be used to either read or write from the pipe.
+// If there was an error @c 0 is returned and an error is logged.
+#ifdef RF_IAMHERE_FOR_DOXYGEN
+i_DECLIMEX_ FILE* rfPopen(void* command,const char* mode);
+#else
+i_DECLIMEX_ FILE* i_rfPopen(void* command,const char* mode);
+#define rfPopen(i_CMD_,i_MODE_) i_rfLMS_WRAP2(FILE*,i_rfPopen,i_CMD_,i_MODE_)
+#endif
+
+// @brief Closes a pipe
+//
+// This function is a cross-platform wrapper for pclose. It closes a file descriptor opened with @ref rfPopen() and
+// returns the exit code of the process that was running
+// @param stream The file descriptor of the pipe returned by @ref rfPopen() that we want to close
+// @return Returns the exit code of the process or -1 if there was an error
+i_DECLIMEX_ int rfPclose(FILE* stream);
+
+// @} End of I/O group
+
+#ifdef __cplusplus
+}///closing bracket for calling from C++
+#endif
+
+
+#endif//include guards end
--- a/samples/C/rfc_string.c
+++ b/samples/C/rfc_string.c
--- a/samples/C/rfc_string.h
+++ b/samples/C/rfc_string.h
--- a/samples/C/wglew.h
+++ b/samples/C/wglew.h
--- a/samples/Shell/sbt.script!
+++ b/samples/Shell/sbt.script!
@@ -0,0 +1,432 @@
+#!/usr/bin/env bash
+#
+# A more capable sbt runner, coincidentally also called sbt.
+# Author: Paul Phillips <paulp@typesafe.com>
+
+# TODO: make these dynamic instead of hard-coding the default release and snapshot versions
+declare -r sbt_release_version=0.11.3
+declare -r sbt_snapshot_version=0.13.0-SNAPSHOT
+
+unset sbt_jar sbt_dir sbt_create sbt_snapshot sbt_launch_dir  # discard any values inherited from the environment
+unset scala_version java_home sbt_explicit_version
+unset verbose debug quiet
+
+build_props_sbt () {  # prints the sbt.version value from project/build.properties; prints nothing if the file is absent
+  if [[ -f project/build.properties ]]; then
+    versionLine=$(grep ^sbt.version project/build.properties)  # not declared local, so this sets a global
+    versionString=${versionLine##sbt.version=}  # strip the "sbt.version=" key prefix
+    echo "$versionString"
+  fi
+}
+
+update_build_props_sbt () {  # rewrites sbt.version in project/build.properties to $1 when it differs from the recorded value
+  local ver="$1"
+  local old=$(build_props_sbt)  # version currently recorded in the file (may be empty)
+
+  if [[ $ver == $old ]]; then  # NOTE(review): the unquoted right-hand side is matched as a glob pattern, not literally
+    return
+  elif [[ -f project/build.properties ]]; then
+    perl -pi -e "s/^sbt\.version=.*\$/sbt.version=${ver}/" project/build.properties  # edit an existing sbt.version line in place
+    grep -q '^sbt.version=' project/build.properties || echo "sbt.version=${ver}" >> project/build.properties  # append when no such line exists
+
+    echo !!!
+    echo !!! Updated file project/build.properties setting sbt.version to: $ver
+    echo !!! Previous value was: $old
+    echo !!!
+  fi
+}
+
+sbt_version () {  # prints the sbt version to use: explicit choice, else build.properties, else the release default
+  if [[ -n $sbt_explicit_version ]]; then
+    echo $sbt_explicit_version
+  else
+    local v=$(build_props_sbt)  # empty when build.properties is missing or has no sbt.version
+    if [[ -n $v ]]; then
+      echo $v
+    else
+      echo $sbt_release_version
+    fi
+  fi
+}
+
+echoerr () {  # echo, but to stderr
+  echo 1>&2 "$@"
+}
+vlog () {  # logs to stderr when verbose or debug is set
+  [[ $verbose || $debug ]] && echoerr "$@"  # returns non-zero when neither flag is set
+}
+dlog () {  # logs to stderr only when debug is set
+  [[ $debug ]] && echoerr "$@"  # returns non-zero when debug is unset
+}
+
+# Prints the location of the script at $1, following one level of symlink; a relative link target
+# is resolved against $1's directory. Believed to work on OSX; other platforms are unconfirmed.
+get_script_path () {
+  local path="$1"
+  [[ -L "$path" ]] || { echo "$path" ; return; }  # not a symlink: use the path as given
+
+  local target=$(readlink "$path")
+  if [[ "${target:0:1}" == "/" ]]; then  # absolute link target
+    echo "$target"
+  else
+    echo "$(dirname "$path")/$target"  # $path is quoted so directories containing whitespace are not word-split
+  fi
+}
+
+# Derives a consistent set of JVM memory flags from one heap size in megabytes ($1, default 1536),
+# so that the individual settings need not be tuned one by one.
+get_mem_opts () {
+  local mem=${1:-1536}
+  local perm=$(( $mem / 4 ))  # MaxPermSize starts as a quarter of the heap...
+  (( $perm > 256 )) || perm=256  # ...raised to at least 256
+  (( $perm < 1024 )) || perm=1024  # ...and capped at 1024
+  local codecache=$(( $perm / 2 ))  # code cache is half of the clamped perm size
+  
+  echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m"
+}
+
+die() {  # prints "Aborting: <message>" on stderr and exits the shell with status 1
+  echo "Aborting: $*" 1>&2  # stderr, so the message is not swallowed by $(...) captures or pipes
+  exit 1
+}
+
+make_url () {  # prints the sbt-launch.jar URL for group id $1, repository category $2 and version $3
+  groupid="$1"  # NOTE(review): these three are not declared local, so they are set in the calling shell
+  category="$2"  # appended to "ivy-" in the URL; callers in this file pass "releases" or "snapshots"
+  version="$3"
+  
+  echo "http://typesafe.artifactoryonline.com/typesafe/ivy-$category/$groupid/sbt-launch/$version/sbt-launch.jar"
+}
+
+declare -r default_jvm_opts="-Dfile.encoding=UTF8"
+declare -r default_sbt_opts="-XX:+CMSClassUnloadingEnabled"
+declare -r default_sbt_mem=1536  # initial value of sbt_mem; the -mem option overrides it
+declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"  # added to the java args by -no-share
+declare -r sbt_opts_file=".sbtopts"
+declare -r jvm_opts_file=".jvmopts"  # read by get_jvm_opts when present
+declare -r latest_28="2.8.2"  # Scala versions selected by the -28, -29 and -210 options
+declare -r latest_29="2.9.1"
+declare -r latest_210="2.10.0-SNAPSHOT"
+
+declare -r script_path=$(get_script_path "$BASH_SOURCE")  # this script's location, with a symlink followed
+declare -r script_dir="$(dirname $script_path)"
+declare -r script_name="$(basename $script_path)"
+
+# mutable settings, initialised to their defaults (command-line options may override them)
+declare java_cmd=java
+declare sbt_launch_dir="$script_dir/.lib"
+declare sbt_mem=$default_sbt_mem
+
+# accumulators filled while parsing the command line (see the add* helper functions)
+declare -a residual_args
+declare -a java_args
+declare -a scalac_args
+declare -a sbt_commands
+
+build_props_scala () {  # prints the build.scala.versions value from project/build.properties, if the file exists
+  if [[ -f project/build.properties ]]; then
+    versionLine=$(grep ^build.scala.versions project/build.properties)  # not declared local, so this sets a global
+    versionString=${versionLine##build.scala.versions=}  # strip the key prefix
+    echo ${versionString%% .*}  # NOTE(review): removes a suffix matching " .*" (space, dot, anything); confirm whether "first of a space-separated list" was intended
+  fi
+}
+
+execRunner () {  # optionally echoes the command line, then replaces this shell with the given command via exec
+  # print the arguments one to a line, quoting any containing spaces
+  [[ $verbose || $debug ]] && echo "# Executing command line:" && {
+    for arg; do  # iterates over the function's positional parameters
+      if printf "%s\n" "$arg" | grep -q ' '; then
+        printf "\"%s\"\n" "$arg"
+      else
+        printf "%s\n" "$arg"
+      fi
+    done
+    echo ""
+  }
+
+  exec "$@"
+}
+
+sbt_groupid () {  # prints the group id used in the launcher URL for the selected sbt version
+  case $(sbt_version) in
+        0.7.*) echo org.scala-tools.sbt ;;
+       0.10.*) echo org.scala-tools.sbt ;;
+    0.11.[12]) echo org.scala-tools.sbt ;;
+            *) echo org.scala-sbt ;;  # everything else, e.g. 0.11.3
+  esac
+}
+
+sbt_artifactory_list () {  # prints the snapshot listing entries that contain the selected version, in reversed listing order
+  local version0=$(sbt_version)
+  local version=${version0%-SNAPSHOT}  # drop a trailing -SNAPSHOT
+  local url="http://typesafe.artifactoryonline.com/typesafe/ivy-snapshots/$(sbt_groupid)/sbt-launch/"
+  dlog "Looking for snapshot list at: $url "
+  
+  curl -s --list-only "$url" | \
+    grep -F $version | \
+    perl -e 'print reverse <>' | \
+    perl -pe 's#^<a href="([^"/]+).*#$1#;'  # keep only the href target, up to the first slash or quote
+}
+
+make_release_url () {  # prints the "releases" launcher URL for the selected sbt version
+  make_url $(sbt_groupid) releases $(sbt_version)
+}
+
+# Prints the launcher URL for the first entry returned by sbt_artifactory_list, i.e. the concrete
+# build-id version at artifactory behind e.g. 0.13.0-SNAPSHOT. Takes no arguments.
+make_snapshot_url () {
+  for ver in $(sbt_artifactory_list); do
+    local url=$(make_url $(sbt_groupid) snapshots $ver)
+    dlog "Testing $url"
+    curl -s --head "$url" >/dev/null  # NOTE(review): the exit status is only logged below, never acted on
+    dlog "curl returned: $?"
+    echo "$url"
+    return  # unconditional: only the first listed version is ever considered
+  done
+}
+
+jar_url () {  # prints the download URL of the launcher jar for the selected sbt version
+  case $(sbt_version) in
+             0.7.*) echo "http://simple-build-tool.googlecode.com/files/sbt-launch-0.7.7.jar" ;;  # any 0.7.x maps to the fixed 0.7.7 launcher
+        *-SNAPSHOT) make_snapshot_url ;;
+                 *) make_release_url ;;
+  esac
+}
+
+jar_file () {  # prints the local launcher jar path for version $1 under sbt_launch_dir
+  echo "$sbt_launch_dir/$1/sbt-launch.jar"
+}
+
+download_url () {  # downloads URL $1 to file $2 using curl, else wget; succeeds only if $2 exists afterwards
+  local url="$1"
+  local jar="$2"
+  
+  echo "Downloading sbt launcher $(sbt_version):"
+  echo "  From  $url"
+  echo "    To  $jar"
+
+  mkdir -p "$(dirname "$jar")" && {  # quoted so a target directory containing whitespace is not word-split
+    if which curl >/dev/null; then
+      curl --fail --silent "$url" --output "$jar"
+    elif which wget >/dev/null; then
+      wget --quiet -O "$jar" "$url"
+    fi
+  } && [[ -f "$jar" ]]
+}
+
+acquire_sbt_jar () {  # sets the globals sbt_url and sbt_jar, downloading the launcher unless the jar file already exists
+  sbt_url="$(jar_url)"
+  sbt_jar="$(jar_file $(sbt_version))"
+
+  [[ -f "$sbt_jar" ]] || download_url "$sbt_url" "$sbt_jar"  # when a download is needed, its status is the function's status
+}
+
+usage () {  # prints the help text; the here-document expands variables and command substitutions when called
+  cat <<EOM
+Usage: $script_name [options]
+
+  -h | -help         print this message
+  -v | -verbose      this runner is chattier
+  -d | -debug        set sbt log level to Debug
+  -q | -quiet        set sbt log level to Error
+  -no-colors         disable ANSI color codes
+  -sbt-create        start sbt even if current directory contains no sbt project
+  -sbt-dir   <path>  path to global settings/plugins directory (default: ~/.sbt/<version>)
+  -sbt-boot  <path>  path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
+  -ivy       <path>  path to local Ivy repository (default: ~/.ivy2)
+  -mem    <integer>  set memory options (default: $sbt_mem, which is
+                       $(get_mem_opts $sbt_mem) )
+  -no-share          use all local caches; no sharing
+  -offline           put sbt in offline mode
+  -jvm-debug <port>  Turn on JVM debugging, open at the given port.
+  -batch             Disable interactive mode
+
+  # sbt version (default: from project/build.properties if present, else latest release)
+  !!! The only way to accomplish this pre-0.12.0 if there is a build.properties file which
+  !!! contains an sbt.version property is to update the file on disk.  That's what this does.
+  -sbt-version  <version>   use the specified version of sbt 
+  -sbt-jar      <path>      use the specified jar as the sbt launcher
+  -sbt-snapshot             use a snapshot version of sbt
+  -sbt-launch-dir <path>    directory to hold sbt launchers (default: $sbt_launch_dir)
+
+  # scala version (default: as chosen by sbt)
+  -28                       use $latest_28
+  -29                       use $latest_29
+  -210                      use $latest_210
+  -scala-home <path>        use the scala build at the specified directory
+  -scala-version <version>  use the specified version of scala
+
+  # java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
+  -java-home <path>         alternate JAVA_HOME
+
+  # jvm options and output control
+  JAVA_OPTS     environment variable holding jvm args, if unset uses "$default_jvm_opts"
+  SBT_OPTS      environment variable holding jvm args, if unset uses "$default_sbt_opts"
+  .jvmopts      if file is in sbt root, it is prepended to the args given to the jvm
+  .sbtopts      if file is in sbt root, it is prepended to the args given to **sbt**
+  -Dkey=val     pass -Dkey=val directly to the jvm
+  -J-X          pass option -X directly to the jvm (-J is stripped)
+  -S-X          add -X to sbt's scalacOptions (-S is stripped)
+
+In the case of duplicated or conflicting options, the order above
+shows precedence: JAVA_OPTS lowest, command line options highest.
+EOM
+}
+
+addJava () {  # appends $1 as a single element of the java_args array
+  dlog "[addJava] arg = '$1'"
+  java_args=( "${java_args[@]}" "$1" )
+}
+addSbt () {  # appends $1 as a single element of the sbt_commands array
+  dlog "[addSbt] arg = '$1'"
+  sbt_commands=( "${sbt_commands[@]}" "$1" )
+}
+addScalac () {  # appends $1 as a single element of the scalac_args array
+  dlog "[addScalac] arg = '$1'"
+  scalac_args=( "${scalac_args[@]}" "$1" )
+}
+addResidual () {  # appends $1 (an argument no option claimed) as a single element of residual_args
+  dlog "[residual] arg = '$1'"
+  residual_args=( "${residual_args[@]}" "$1" )
+}
+addResolver () {  # queues an sbt "set" command that adds resolver expression $1 in ThisBuild
+  addSbt "set resolvers in ThisBuild += $1"
+}
+addDebugger () {  # adds the JVM flags for socket-based JDWP debugging at address $1
+  addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1"  # NOTE(review): both flags are stored as ONE java_args element; confirm the consumer word-splits it
+}
+get_jvm_opts () {  # prints the contents of the .jvmopts file when it exists in the current directory
+  # echo "${JAVA_OPTS:-$default_jvm_opts}"
+  # echo "${SBT_OPTS:-$default_sbt_opts}"
+
+  [[ -f "$jvm_opts_file" ]] && cat "$jvm_opts_file"  # returns non-zero when the file is absent
+}
+
+process_args ()
+{
+  require_arg () {  # dies unless option $2 has a usable value $3; $1 names the expected value type in the error message
+    local type="$1"
+    local opt="$2"
+    local arg="$3"
+    
+    if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then  # missing value, or the next token looks like another option
+      die "$opt requires <$type> argument"
+    fi
+  }
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+       -h|-help) usage; exit 1 ;;
+    -v|-verbose) verbose=1 && shift ;;
+      -d|-debug) debug=1 && shift ;;
+      -q|-quiet) quiet=1 && shift ;;
+
+           -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;;
+           -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;;
+     -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;;
+      -no-share) addJava "$noshare_opts" && shift ;;
+      -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;;
+       -sbt-dir) require_arg path "$1" "$2" && sbt_dir="$2" && shift 2 ;;
+     -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;;
+       -offline) addSbt "set offline := true" && shift ;;
+     -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;;
+         -batch) exec </dev/null && shift ;;
+
+    -sbt-create) sbt_create=true && shift ;;
+  -sbt-snapshot) sbt_explicit_version=$sbt_snapshot_version && shift ;;
+       -sbt-jar) require_arg path "$1" "$2" && sbt_jar="$2" && shift 2 ;;
+   -sbt-version) require_arg version "$1" "$2" && sbt_explicit_version="$2" && shift 2 ;;
+-sbt-launch-dir) require_arg path "$1" "$2" && sbt_launch_dir="$2" && shift 2 ;;
+ -scala-version) require_arg version "$1" "$2" && addSbt "set scalaVersion := \"$2\"" && shift 2 ;;
+    -scala-home) require_arg path "$1" "$2" && addSbt "set scalaHome in ThisBuild := Some(file(\"$2\"))" && shift 2 ;;
+     -java-home) require_arg path "$1" "$2" && java_cmd="$2/bin/java" && shift 2 ;;
+
+            -D*) addJava "$1" && shift ;;
+            -J*) addJava "${1:2}" && shift ;;
+            -S*) addScalac "${1:2}" && shift ;;
+            -28) addSbt "++ $latest_28" && shift ;;
+            -29) addSbt "++ $latest_29" && shift ;;
+           -210) addSbt "++ $latest_210" && shift ;;
+
+              *) addResidual "$1" && shift ;;
+    esac
+  done
+  
+  [[ $debug ]] && {
+    case $(sbt_version) in
+     0.7.*) addSbt "debug" ;; 
+         *) addSbt "set logLevel in Global := Level.Debug" ;;
+    esac
+  }
+  [[ $quiet ]] && {
+    case $(sbt_version) in
+     0.7.*) ;; 
+         *) addSbt "set logLevel in Global := Level.Error" ;;
+    esac
+  }
+}
+
+# if .sbtopts exists, prepend its contents to $@ so it can be processed by this runner
+# (each line of the file becomes exactly one argument)
+[[ -f "$sbt_opts_file" ]] && {
+  sbtargs=()
+  while IFS= read -r arg; do
+    sbtargs=( "${sbtargs[@]}" "$arg" )
+  done <"$sbt_opts_file"
+
+  set -- "${sbtargs[@]}" "$@"
+}
+
+# process the combined args, then reset "$@" to the residuals
+process_args "$@"
+set -- "${residual_args[@]}"
+argumentCount=$#
+
+# set scalacOptions if we were given any -S opts
+# NOTE(review): "${scalac_args[@]}" inside the quoted string expands to one word per
+# element and addSbt only reads $1, so more than one -S option looks like it would be
+# truncated here - confirm.
+[[ ${#scalac_args[@]} -eq 0 ]] || addSbt "set scalacOptions in ThisBuild += \"${scalac_args[@]}\""
+
+# Update build.properties on disk to set explicit version - sbt gives us no choice
+[[ -n "$sbt_explicit_version" ]] && update_build_props_sbt "$sbt_explicit_version"
+echo "Detected sbt version $(sbt_version)"
+
+# NOTE(review): the option parsing in process_args does not assign scala_version
+# (-scala-version queues an sbt command instead) - confirm it is set elsewhere.
+[[ -n "$scala_version" ]] && echo "Overriding scala version to $scala_version"
+
+# no args - alert them there's stuff in here
+(( $argumentCount > 0 )) || echo "Starting $script_name: invoke with -help for other options"
+
+# verify this is an sbt dir or -sbt-create was given
+[[ -f ./build.sbt || -d ./project || -n "$sbt_create" ]] || {
+  cat <<EOM
+$(pwd) doesn't appear to be an sbt project.
+If you want to start sbt anyway, run:
+  $0 -sbt-create
+
+EOM
+  exit 1
+}
+
+# pick up completion if present; todo
+[[ -f .sbt_completion.sh ]] && source .sbt_completion.sh
+
+# no jar? download it.
+[[ -f "$sbt_jar" ]] || acquire_sbt_jar || {
+  # still no jar? uh-oh.
+  echo "Download failed. Obtain the jar manually and place it at $sbt_jar"
+  exit 1
+}
+
+# no -sbt-dir given: default to ~/.sbt/<sbt version> and pass it as sbt.global.base
+# NOTE(review): when -sbt-dir is given, nothing visible here passes it to the JVM - confirm.
+[[ -n "$sbt_dir" ]] || {
+  sbt_dir=~/.sbt/$(sbt_version)
+  addJava "-Dsbt.global.base=$sbt_dir"
+  echo "Using $sbt_dir as sbt dir, -sbt-dir to override."
+}
+
+# since sbt 0.7 doesn't understand iflast
+(( ${#residual_args[@]} == 0 )) && residual_args=( "shell" )
+
+# run sbt
+# (the memory opts, jvm opts and java_args are expanded unquoted, so they are word-split;
+# sbt_commands and residual_args are passed one element per argument)
+execRunner "$java_cmd" \
+  $(get_mem_opts $sbt_mem) \
+  $(get_jvm_opts) \
+  ${java_args[@]} \
+  -jar "$sbt_jar" \
+  "${sbt_commands[@]}" \
+  "${residual_args[@]}"
--- a/test/test_blob.rb
+++ b/test/test_blob.rb
@@ -2,6 +2,7 @@ require 'linguist/file_blob'
 require 'linguist/samples'

 require 'test/unit'
+require 'mocha'
 require 'mime/types'
 require 'pygments'

@@ -261,6 +262,12 @@ class TestBlob < Test::Unit::TestCase
    assert !blob("Text/dump.sql").indexable?
    assert !blob("Binary/github.po").indexable?
    assert !blob("Binary/linguist.gem").indexable?
+
+    # large binary blobs should fail on size check first, not call 
+    # into charlock_holmes and alloc big buffers for testing encoding
+    b = blob("Binary/octocat.ai")
+    b.expects(:binary?).never
+    assert !b.indexable?
  end

  def test_language
Author	SHA1	Message	Date
Scott J. Goldman	fc435a2541	Linguist 2.3.2	2012-09-02 00:08:37 -07:00
Scott J. Goldman	04394750e7	When testing if a blob is safe to colorize, check size first Similar to `e415a13`	2012-09-02 00:08:37 -07:00
Scott J. Goldman	e415a1351b	When testing if a blob is indexable, check size first Otherwise, charlock_holmes will allocate another large binary buffer for testing the encoding, which is a problem if the binary blob is many hundreds of MB large. It'll just fail and crash ruby.	2012-08-31 22:47:19 -07:00
Joshua Peek	6ec907a915	Merge pull request #245 from jcazevedo/master Add Shell sample	2012-08-28 10:55:11 -07:00
Joao Azevedo	1f55f01fa9	Add Shell sample	2012-08-28 18:01:46 +01:00
Joshua Peek	5d79b88875	Linguist 2.3.1	2012-08-27 11:34:55 -05:00
Joshua Peek	458890b4b9	Add C++ sample	2012-08-27 11:33:28 -05:00
Joshua Peek	89267f792d	Rebuild samples db	2012-08-27 11:30:44 -05:00
Joshua Peek	b183fcca05	Only read up to 100KB	2012-08-27 11:30:38 -05:00
Joshua Peek	684a57dbc0	Add another C sample	2012-08-27 11:21:57 -05:00
Joshua Peek	400086a5c8	Add more C samples Closes #237	2012-08-23 13:38:16 -05:00