- About Scala
- Documentation
- Code Examples
- Software
- Scala Developers
Parsing byte sequences
Tue, 2009-12-29, 14:36
Hi,
Anyone know how to use the parser combinators on byte arrays and InputStreams? I want to write a parser that will parse bytes rather than characters.
Cheers,
-John
Anyone know how to use the parser combinators on byte arrays and InputStreams? I want to write a parser that will parse bytes rather than characters.
Cheers,
-John
Tue, 2009-12-29, 15:17
#2
Re: Parsing byte sequences
If you just want it to work (as opposed to being incredibly efficient), then use
scala.util.parsing.input.StreamReader( ... )
to create a parser-compatible reader from any Java reader. (This uses apply(...) from the companion object; it's not a constructor call.)
To read an InputStream, use
new java.io.InputStreamReader( myInputStream )
in place of the "..." above.
To read a byte array, wrap your byte array in a ByteArrayInputStream:
new java.io.InputStreamReader( new java.io.ByteArrayInputStream( myArray ) )
In this case, though, if you really wanted/needed efficiency, you'd likely be better off by subclassing Reader[Char] to use a byte array as a buffer. I don't know whether you'd need to wrap your byte array in something that made it obey the java.lang.CharSequence interface; if so, you could just do that directly and use scala.util.parsing.input.CharSequenceReader to get the reader for the parser.
--Rex
On Tue, Dec 29, 2009 at 8:36 AM, John Ky <newhoggy [at] gmail [dot] com> wrote:
scala.util.parsing.input.StreamReader( ... )
to create a parser-compatible reader from any Java reader. (This uses apply(...) from the companion object; it's not a constructor call.)
To read an InputStream, use
new java.io.InputStreamReader( myInputStream )
in place of the "..." above.
To read a byte array, wrap your byte array in a ByteArrayInputStream:
new java.io.InputStreamReader( new java.io.ByteArrayInputStream( myArray ) )
In this case, though, if you really wanted/needed efficiency, you'd likely be better off by subclassing Reader[Char] to use a byte array as a buffer. I don't know whether you'd need to wrap your byte array in something that made it obey the java.lang.CharSequence interface; if so, you could just do that directly and use scala.util.parsing.input.CharSequenceReader to get the reader for the parser.
--Rex
On Tue, Dec 29, 2009 at 8:36 AM, John Ky <newhoggy [at] gmail [dot] com> wrote:
Hi,
Anyone know how to use the parser combinators on byte arrays and InputStreams? I want to write a parser that will parse bytes rather than characters.
Cheers,
-John
Tue, 2009-12-29, 17:17
#3
Re: Parsing byte sequences
On Wed, Dec 30, 2009 at 12:36:07AM +1100, John Ky wrote:
> Anyone know how to use the parser combinators on byte arrays and
> InputStreams? I want to write a parser that will parse bytes rather
> than characters.
I happen to be working on a very entertaining project involving that at
this very moment. I'll publish this all pretty soon but here are a few
classes I use to make the standard library seem more byte oriented.
(Some of this is specific to my project.)
import scala.util.parsing.combinator._
import scala.util.parsing.input.{ Position, Reader }
import scala.util.parsing.input.CharArrayReader.EofCh
import scala.annotation.tailrec
import java.lang.Float.intBitsToFloat
import java.lang.Double.longBitsToDouble
/** Element-level helper parsers that work for any `Elem` type. */
trait ParsersUtil extends Parsers {
  /** Accepts whatever single element comes next. */
  lazy val anyElem: Parser[Elem] = elem("anyElem", (_: Elem) => true)

  /** Accepts one element provided it is not listed in `xs`. */
  def elemExcept(xs: Elem*): Parser[Elem] =
    elem("elemExcept", (candidate: Elem) => !xs.contains(candidate))

  /** Accepts one element provided it is listed in `xs`. */
  def elemOf(xs: Elem*): Parser[Elem] =
    elem("elemOf", (candidate: Elem) => xs.contains(candidate))

  /** Parses `n` consecutive elements using [[anyElem]]. */
  def take(n: Int): Parser[Seq[Elem]] =
    repN(n, anyElem)

  /** Collects elements for as long as `cond` does not match at the current position. */
  def takeUntil(cond: Parser[Elem]): Parser[Seq[Elem]] =
    takeUntil(cond, anyElem)

  /** Repeatedly applies `p`, stopping as soon as `cond` would match (or `p` fails). */
  def takeUntil(cond: Parser[Elem], p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(not(cond) ~> p)

  /** Applies `p` zero or more times and collects the results. */
  def takeWhile(p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(p)
}
/** A [[Position]] for byte input: everything is reported on line 1 and the
  * column is the 1-based form of the 0-based `offset`.
  */
case class ByteOffsetPosition(offset: Int) extends Position {
  /** Always the empty string; no textual line is exposed for byte input. */
  def lineContents: String = ""

  /** Always 1. */
  final val line = 1

  /** `offset` shifted to be 1-based. */
  def column = 1 + offset
}
/** A `Reader[Byte]` over a byte array, positioned at `offset`.
  *
  * `rest` and `drop` create new readers that reuse the same `bytes` array.
  *
  * @param bytes  the complete input buffer
  * @param offset index in `bytes` of the current element
  */
class ByteReader(val bytes: Array[Byte], override val offset: Int) extends Reader[Byte] {
  // NOTE(review): both getBytes calls below use the platform default charset, so
  // the produced bytes can differ between machines — confirm the intended encoding.

  /** Reads the bytes of `reader.source.toString`, starting at offset 0
    * (the current position of `reader` is not carried over).
    */
  def this(reader: Reader[_]) = this(reader.source.toString.getBytes, 0)

  /** Copies `bytes` into an array and starts at offset 0. */
  def this(bytes: Seq[Byte]) = this(bytes.toArray, 0)

  /** Starts at offset 0 over the encoded bytes of `str`. */
  def this(str: String) = this(str.getBytes, 0)

  /** One char per byte. Bytes are masked with 0xFF so that values >= 0x80 map
    * to chars 0x80-0xFF instead of being sign-extended.
    */
  override def source: CharSequence = new String(bytes.map(b => (b & 0xFF).toChar))

  /** The current byte. At end of input this returns `EofCh.toByte`, which cannot
    * be told apart from a real byte of that value — check [[atEnd]] first.
    */
  def first: Byte = if (offset < bytes.length) bytes(offset) else EofCh.toByte

  /** Reader advanced by one byte, or this reader when already at the end. */
  def rest: ByteReader = if (offset < bytes.length) new ByteReader(bytes, offset + 1) else this

  def pos: Position = ByteOffsetPosition(offset)

  def atEnd: Boolean = offset >= bytes.length

  /** Byte at absolute index `n` of the underlying array (not relative to `offset`). */
  def byteAt(n: Int): Byte = bytes(n)

  /** Number of bytes remaining; never negative, even if `offset` is past the end. */
  def length: Int = math.max(0, bytes.length - offset)

  /** Advances by `n` bytes without allocating a reader per step.
    *
    * Mirrors repeated application of [[rest]]: a non-positive `n` leaves the
    * reader where it is, and advancing stops at the end of the input.
    */
  override def drop(n: Int): ByteReader =
    if (n <= 0) this
    else new ByteReader(bytes, offset + math.min(n, length))

  /** Up to `n` bytes starting at the current position (fewer if the input is shorter). */
  def take(n: Int): Seq[Byte] = {
    // slice copies only the requested window instead of the whole remainder
    val from  = math.max(offset, 0)
    val count = math.max(0, math.min(n, bytes.length - from))
    bytes.slice(from, from + count).toSeq
  }

  override def toString = "ByteReader(%d / %d)".format(offset, bytes.length)
}
/** Parsers whose element type is `Byte`, with helpers for fixed-width integers
  * and floating-point values.
  */
trait BinaryParsers extends Parsers with ParsersUtil {
  type Elem = Byte

  /** Lets parser bodies call `ByteReader` members (`length`, `take`) on any `Input`.
    * NOTE(review): the non-`ByteReader` branch goes through the `Reader[_]`
    * constructor, which restarts at offset 0 — confirm that is intended.
    */
  protected implicit def readerToByteReader(x: Input): ByteReader = x match {
    case br: ByteReader => br
    case _              => new ByteReader(x)
  }

  /** Accepts one byte satisfying `p`, failing at end of input.
    *
    * The explicit `atEnd` check is needed because `ByteReader.first` returns a
    * sentinel byte at the end of the input; without it that sentinel would be
    * accepted as data and repetition parsers would never stop.
    */
  override def acceptIf(p: Elem => Boolean)(err: Elem => String): Parser[Elem] = Parser { in =>
    if (in.atEnd) Failure("end of input", in)
    else {
      val head = in.first
      if (p(head)) Success(head, in.rest) else Failure(err(head), in)
    }
  }

  /** Accepts one byte on which `f` is defined, failing at end of input
    * (same reasoning as [[acceptIf]]).
    */
  override def acceptMatch[U](expected: String, f: PartialFunction[Elem, U]): Parser[U] = Parser { in =>
    if (in.atEnd) Failure("end of input", in)
    else {
      val head = in.first
      if (f.isDefinedAt(head)) Success(f(head), in.rest) else Failure(expected + " expected", in)
    }
  }

  /** Folds bytes into an `Int`, earlier bytes ending up more significant
    * (big-endian); each byte is treated as unsigned.
    */
  def toInt(bytes: Seq[Byte]): Int = bytes.foldLeft(0)((x, b) => (x << 8) + (b & 0xFF))

  /** Same as [[toInt]] but accumulating into a `Long`. */
  def toLong(bytes: Seq[Byte]): Long = bytes.foldLeft(0L)((x, b) => (x << 8) + (b & 0xFF))

  /** One raw (signed) byte. */
  lazy val byte: Parser[Byte] = anyElem
  /** One byte as an unsigned value 0-255. */
  lazy val u1: Parser[Int] = byte ^^ (_ & 0xFF)
  /** Two bytes, big-endian. */
  lazy val u2: Parser[Int] = bytes(2) ^^ toInt
  /** Four bytes, big-endian; the result is negative when the top bit is set. */
  lazy val u4: Parser[Int] = bytes(4) ^^ toInt
  /** Four bytes reinterpreted as a `Float` bit pattern. */
  lazy val u4f: Parser[Float] = u4 ^^ intBitsToFloat
  /** Eight bytes, big-endian; the result is negative when the top bit is set. */
  lazy val u8: Parser[Long] = bytes(8) ^^ toLong
  /** Eight bytes reinterpreted as a `Double` bit pattern. */
  lazy val u8d: Parser[Double] = u8 ^^ longBitsToDouble

  /** Consumes exactly `n` bytes, failing (without consuming) when fewer remain
    * or when `n` is negative.
    */
  def bytes(n: Int): Parser[Seq[Byte]] = Parser { in =>
    // A negative count would otherwise pass the length check below.
    if (n < 0) Failure("Requested a negative number of bytes: %d".format(n), in)
    else if (n <= in.length) Success(in take n, in drop n)
    else Failure("Requested %d bytes but only %d remain".format(n, in.length), in)
  }

  /** Runs `p` on `in`. */
  def parse[T](p: Parser[T], in: Input): ParseResult[T] = p(in)

  /** Runs `p` on the bytes of `in` (encoded as done by `ByteReader`'s String constructor). */
  def parse[T](p: Parser[T], in: String): ParseResult[T] = parse(p, new ByteReader(in))
}
Wed, 2009-12-30, 03:07
#4
Re: Parsing byte sequences
Hi Paul,
Thanks for that. It works well. It would be nice if the Scala Library included some of this basic functionality.
If your work makes processing binary data easier, I'd certainly be interested in taking a look when you publish it.
Cheers,
-John
On Wed, Dec 30, 2009 at 3:07 AM, Paul Phillips <paulp [at] improving [dot] org> wrote:
Thanks for that. It works well. It would be nice if the Scala Library included some of this basic functionality.
If your work makes processing binary data easier, I'd certainly be interested in taking a look when you publish it.
Cheers,
-John
On Wed, Dec 30, 2009 at 3:07 AM, Paul Phillips <paulp [at] improving [dot] org> wrote:
On Wed, Dec 30, 2009 at 12:36:07AM +1100, John Ky wrote:
> Anyone know how to use the parser combinators on byte arrays and
> InputStreams? I want to write a parser that will parse bytes rather
> than characters.
I happen to be working on a very entertaining project involving that at
this very moment. I'll publish this all pretty soon but here are a few
classes I use to make the standard library seem more byte oriented.
(Some of this is specific to my project.)
import scala.util.parsing.combinator._
import scala.util.parsing.input.{ Position, Reader }
import scala.util.parsing.input.CharArrayReader.EofCh
import scala.annotation.tailrec
import java.lang.Float.intBitsToFloat
import java.lang.Double.longBitsToDouble
/** Element-level helper parsers that work for any `Elem` type. */
trait ParsersUtil extends Parsers {
  /** Accepts whatever single element comes next. */
  lazy val anyElem: Parser[Elem] = elem("anyElem", (_: Elem) => true)

  /** Accepts one element provided it is not listed in `xs`. */
  def elemExcept(xs: Elem*): Parser[Elem] =
    elem("elemExcept", (candidate: Elem) => !xs.contains(candidate))

  /** Accepts one element provided it is listed in `xs`. */
  def elemOf(xs: Elem*): Parser[Elem] =
    elem("elemOf", (candidate: Elem) => xs.contains(candidate))

  /** Parses `n` consecutive elements using [[anyElem]]. */
  def take(n: Int): Parser[Seq[Elem]] =
    repN(n, anyElem)

  /** Collects elements for as long as `cond` does not match at the current position. */
  def takeUntil(cond: Parser[Elem]): Parser[Seq[Elem]] =
    takeUntil(cond, anyElem)

  /** Repeatedly applies `p`, stopping as soon as `cond` would match (or `p` fails). */
  def takeUntil(cond: Parser[Elem], p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(not(cond) ~> p)

  /** Applies `p` zero or more times and collects the results. */
  def takeWhile(p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(p)
}
/** A [[Position]] for byte input: everything is reported on line 1 and the
  * column is the 1-based form of the 0-based `offset`.
  */
case class ByteOffsetPosition(offset: Int) extends Position {
  /** Always the empty string; no textual line is exposed for byte input. */
  def lineContents: String = ""

  /** Always 1. */
  final val line = 1

  /** `offset` shifted to be 1-based. */
  def column = 1 + offset
}
/** A `Reader[Byte]` over a byte array, positioned at `offset`.
  *
  * `rest` and `drop` create new readers that reuse the same `bytes` array.
  *
  * @param bytes  the complete input buffer
  * @param offset index in `bytes` of the current element
  */
class ByteReader(val bytes: Array[Byte], override val offset: Int) extends Reader[Byte] {
  // NOTE(review): both getBytes calls below use the platform default charset, so
  // the produced bytes can differ between machines — confirm the intended encoding.

  /** Reads the bytes of `reader.source.toString`, starting at offset 0
    * (the current position of `reader` is not carried over).
    */
  def this(reader: Reader[_]) = this(reader.source.toString.getBytes, 0)

  /** Copies `bytes` into an array and starts at offset 0. */
  def this(bytes: Seq[Byte]) = this(bytes.toArray, 0)

  /** Starts at offset 0 over the encoded bytes of `str`. */
  def this(str: String) = this(str.getBytes, 0)

  /** One char per byte. Bytes are masked with 0xFF so that values >= 0x80 map
    * to chars 0x80-0xFF instead of being sign-extended.
    */
  override def source: CharSequence = new String(bytes.map(b => (b & 0xFF).toChar))

  /** The current byte. At end of input this returns `EofCh.toByte`, which cannot
    * be told apart from a real byte of that value — check [[atEnd]] first.
    */
  def first: Byte = if (offset < bytes.length) bytes(offset) else EofCh.toByte

  /** Reader advanced by one byte, or this reader when already at the end. */
  def rest: ByteReader = if (offset < bytes.length) new ByteReader(bytes, offset + 1) else this

  def pos: Position = ByteOffsetPosition(offset)

  def atEnd: Boolean = offset >= bytes.length

  /** Byte at absolute index `n` of the underlying array (not relative to `offset`). */
  def byteAt(n: Int): Byte = bytes(n)

  /** Number of bytes remaining; never negative, even if `offset` is past the end. */
  def length: Int = math.max(0, bytes.length - offset)

  /** Advances by `n` bytes without allocating a reader per step.
    *
    * Mirrors repeated application of [[rest]]: a non-positive `n` leaves the
    * reader where it is, and advancing stops at the end of the input.
    */
  override def drop(n: Int): ByteReader =
    if (n <= 0) this
    else new ByteReader(bytes, offset + math.min(n, length))

  /** Up to `n` bytes starting at the current position (fewer if the input is shorter). */
  def take(n: Int): Seq[Byte] = {
    // slice copies only the requested window instead of the whole remainder
    val from  = math.max(offset, 0)
    val count = math.max(0, math.min(n, bytes.length - from))
    bytes.slice(from, from + count).toSeq
  }

  override def toString = "ByteReader(%d / %d)".format(offset, bytes.length)
}
/** Parsers whose element type is `Byte`, with helpers for fixed-width integers
  * and floating-point values.
  */
trait BinaryParsers extends Parsers with ParsersUtil {
  type Elem = Byte

  /** Lets parser bodies call `ByteReader` members (`length`, `take`) on any `Input`.
    * NOTE(review): the non-`ByteReader` branch goes through the `Reader[_]`
    * constructor, which restarts at offset 0 — confirm that is intended.
    */
  protected implicit def readerToByteReader(x: Input): ByteReader = x match {
    case br: ByteReader => br
    case _              => new ByteReader(x)
  }

  /** Accepts one byte satisfying `p`, failing at end of input.
    *
    * The explicit `atEnd` check is needed because `ByteReader.first` returns a
    * sentinel byte at the end of the input; without it that sentinel would be
    * accepted as data and repetition parsers would never stop.
    */
  override def acceptIf(p: Elem => Boolean)(err: Elem => String): Parser[Elem] = Parser { in =>
    if (in.atEnd) Failure("end of input", in)
    else {
      val head = in.first
      if (p(head)) Success(head, in.rest) else Failure(err(head), in)
    }
  }

  /** Accepts one byte on which `f` is defined, failing at end of input
    * (same reasoning as [[acceptIf]]).
    */
  override def acceptMatch[U](expected: String, f: PartialFunction[Elem, U]): Parser[U] = Parser { in =>
    if (in.atEnd) Failure("end of input", in)
    else {
      val head = in.first
      if (f.isDefinedAt(head)) Success(f(head), in.rest) else Failure(expected + " expected", in)
    }
  }

  /** Folds bytes into an `Int`, earlier bytes ending up more significant
    * (big-endian); each byte is treated as unsigned.
    */
  def toInt(bytes: Seq[Byte]): Int = bytes.foldLeft(0)((x, b) => (x << 8) + (b & 0xFF))

  /** Same as [[toInt]] but accumulating into a `Long`. */
  def toLong(bytes: Seq[Byte]): Long = bytes.foldLeft(0L)((x, b) => (x << 8) + (b & 0xFF))

  /** One raw (signed) byte. */
  lazy val byte: Parser[Byte] = anyElem
  /** One byte as an unsigned value 0-255. */
  lazy val u1: Parser[Int] = byte ^^ (_ & 0xFF)
  /** Two bytes, big-endian. */
  lazy val u2: Parser[Int] = bytes(2) ^^ toInt
  /** Four bytes, big-endian; the result is negative when the top bit is set. */
  lazy val u4: Parser[Int] = bytes(4) ^^ toInt
  /** Four bytes reinterpreted as a `Float` bit pattern. */
  lazy val u4f: Parser[Float] = u4 ^^ intBitsToFloat
  /** Eight bytes, big-endian; the result is negative when the top bit is set. */
  lazy val u8: Parser[Long] = bytes(8) ^^ toLong
  /** Eight bytes reinterpreted as a `Double` bit pattern. */
  lazy val u8d: Parser[Double] = u8 ^^ longBitsToDouble

  /** Consumes exactly `n` bytes, failing (without consuming) when fewer remain
    * or when `n` is negative.
    */
  def bytes(n: Int): Parser[Seq[Byte]] = Parser { in =>
    // A negative count would otherwise pass the length check below.
    if (n < 0) Failure("Requested a negative number of bytes: %d".format(n), in)
    else if (n <= in.length) Success(in take n, in drop n)
    else Failure("Requested %d bytes but only %d remain".format(n, in.length), in)
  }

  /** Runs `p` on `in`. */
  def parse[T](p: Parser[T], in: Input): ParseResult[T] = p(in)

  /** Runs `p` on the bytes of `in` (encoded as done by `ByteReader`'s String constructor). */
  def parse[T](p: Parser[T], in: String): ParseResult[T] = parse(p, new ByteReader(in))
}
--
Paul Phillips | Before a man speaks it is always safe to assume
Everyman | that he is a fool. After he speaks, it is seldom
Empiricist | necessary to assume it.
slap pi uphill! | -- H. L. Mencken
Wed, 2009-12-30, 23:57
#5
Re: Parsing byte sequences
Hi Paul,
I had an issue with EOF, where EofCh.toByte was being consumed by my parser as 26. My parsers therefore fail to parse the EOF properly. I fixed this by using an exception like this:
BinaryReader.scala:
/** The current byte; throws `EofException` when no bytes remain. */
def first: Byte =
  if (offset >= bytes.length) throw EofException
  else bytes(offset)
BinaryParsers.scala:
/** Accepts one element satisfying `p`; fails with "EOF unexpected" at end of input. */
override def acceptIf(p: Elem => Boolean)(err: Elem => String): Parser[Elem] = Parser { in =>
  // Ask the reader first so the ordinary end-of-input case never has to throw.
  if (in.atEnd) Failure("EOF unexpected", in)
  else try {
    val head = in.first // read once and reuse
    if (p(head)) Success(head, in.rest) else Failure(err(head), in)
  } catch {
    // Safety net for a reader whose `first` still signals EOF by throwing.
    case e if e eq EofException => Failure("EOF unexpected", in)
  }
}
/** Accepts one element on which `f` is defined and returns `f` applied to it;
  * fails with an "EOF unexpected" message at end of input.
  */
override def acceptMatch[U](expected: String, f: PartialFunction[Elem, U]): Parser[U] = Parser { in =>
  // Ask the reader first so the ordinary end-of-input case never has to throw.
  if (in.atEnd) Failure("EOF unexpected: " + expected + " expected", in)
  else try {
    val head = in.first // read once and reuse
    if (f.isDefinedAt(head)) Success(f(head), in.rest) else Failure(expected + " expected", in)
  } catch {
    // Safety net for a reader whose `first` still signals EOF by throwing.
    case e if e eq EofException => Failure("EOF unexpected: " + expected + " expected", in)
  }
}
Because I reuse the EofException object all the time, it shouldn't be too expensive.
Cheers,
-John
2009/12/30 Paul Phillips <paulp [at] improving [dot] org>
I had an issue with EOF, where EofCh.toByte was being consumed by my parser as 26. My parsers therefore fail to parse the EOF properly. I fixed this by using an exception like this:
BinaryReader.scala:
/** The current byte; throws `EofException` when no bytes remain. */
def first: Byte =
  if (offset >= bytes.length) throw EofException
  else bytes(offset)
BinaryParsers.scala:
/** Accepts one element satisfying `p`; fails with "EOF unexpected" at end of input. */
override def acceptIf(p: Elem => Boolean)(err: Elem => String): Parser[Elem] = Parser { in =>
  // Ask the reader first so the ordinary end-of-input case never has to throw.
  if (in.atEnd) Failure("EOF unexpected", in)
  else try {
    val head = in.first // read once and reuse
    if (p(head)) Success(head, in.rest) else Failure(err(head), in)
  } catch {
    // Safety net for a reader whose `first` still signals EOF by throwing.
    case e if e eq EofException => Failure("EOF unexpected", in)
  }
}
/** Accepts one element on which `f` is defined and returns `f` applied to it;
  * fails with an "EOF unexpected" message at end of input.
  */
override def acceptMatch[U](expected: String, f: PartialFunction[Elem, U]): Parser[U] = Parser { in =>
  // Ask the reader first so the ordinary end-of-input case never has to throw.
  if (in.atEnd) Failure("EOF unexpected: " + expected + " expected", in)
  else try {
    val head = in.first // read once and reuse
    if (f.isDefinedAt(head)) Success(f(head), in.rest) else Failure(expected + " expected", in)
  } catch {
    // Safety net for a reader whose `first` still signals EOF by throwing.
    case e if e eq EofException => Failure("EOF unexpected: " + expected + " expected", in)
  }
}
Because I reuse the EofException object all the time, it shouldn't be too expensive.
Cheers,
-John
2009/12/30 Paul Phillips <paulp [at] improving [dot] org>
On Wed, Dec 30, 2009 at 12:36:07AM +1100, John Ky wrote:
> Anyone know how to use the parser combinators on byte arrays and
> InputStreams? I want to write a parser that will parse bytes rather
> than characters.
I happen to be working on a very entertaining project involving that at
this very moment. I'll publish this all pretty soon but here are a few
classes I use to make the standard library seem more byte oriented.
(Some of this is specific to my project.)
import scala.util.parsing.combinator._
import scala.util.parsing.input.{ Position, Reader }
import scala.util.parsing.input.CharArrayReader.EofCh
import scala.annotation.tailrec
import java.lang.Float.intBitsToFloat
import java.lang.Double.longBitsToDouble
/** Element-level helper parsers that work for any `Elem` type. */
trait ParsersUtil extends Parsers {
  /** Accepts whatever single element comes next. */
  lazy val anyElem: Parser[Elem] = elem("anyElem", (_: Elem) => true)

  /** Accepts one element provided it is not listed in `xs`. */
  def elemExcept(xs: Elem*): Parser[Elem] =
    elem("elemExcept", (candidate: Elem) => !xs.contains(candidate))

  /** Accepts one element provided it is listed in `xs`. */
  def elemOf(xs: Elem*): Parser[Elem] =
    elem("elemOf", (candidate: Elem) => xs.contains(candidate))

  /** Parses `n` consecutive elements using [[anyElem]]. */
  def take(n: Int): Parser[Seq[Elem]] =
    repN(n, anyElem)

  /** Collects elements for as long as `cond` does not match at the current position. */
  def takeUntil(cond: Parser[Elem]): Parser[Seq[Elem]] =
    takeUntil(cond, anyElem)

  /** Repeatedly applies `p`, stopping as soon as `cond` would match (or `p` fails). */
  def takeUntil(cond: Parser[Elem], p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(not(cond) ~> p)

  /** Applies `p` zero or more times and collects the results. */
  def takeWhile(p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(p)
}
/** A [[Position]] for byte input: everything is reported on line 1 and the
  * column is the 1-based form of the 0-based `offset`.
  */
case class ByteOffsetPosition(offset: Int) extends Position {
  /** Always the empty string; no textual line is exposed for byte input. */
  def lineContents: String = ""

  /** Always 1. */
  final val line = 1

  /** `offset` shifted to be 1-based. */
  def column = 1 + offset
}
/** A `Reader[Byte]` over a byte array, positioned at `offset`.
  *
  * `rest` and `drop` create new readers that reuse the same `bytes` array.
  *
  * @param bytes  the complete input buffer
  * @param offset index in `bytes` of the current element
  */
class ByteReader(val bytes: Array[Byte], override val offset: Int) extends Reader[Byte] {
  // NOTE(review): both getBytes calls below use the platform default charset, so
  // the produced bytes can differ between machines — confirm the intended encoding.

  /** Reads the bytes of `reader.source.toString`, starting at offset 0
    * (the current position of `reader` is not carried over).
    */
  def this(reader: Reader[_]) = this(reader.source.toString.getBytes, 0)

  /** Copies `bytes` into an array and starts at offset 0. */
  def this(bytes: Seq[Byte]) = this(bytes.toArray, 0)

  /** Starts at offset 0 over the encoded bytes of `str`. */
  def this(str: String) = this(str.getBytes, 0)

  /** One char per byte. Bytes are masked with 0xFF so that values >= 0x80 map
    * to chars 0x80-0xFF instead of being sign-extended.
    */
  override def source: CharSequence = new String(bytes.map(b => (b & 0xFF).toChar))

  /** The current byte. At end of input this returns `EofCh.toByte`, which cannot
    * be told apart from a real byte of that value — check [[atEnd]] first.
    */
  def first: Byte = if (offset < bytes.length) bytes(offset) else EofCh.toByte

  /** Reader advanced by one byte, or this reader when already at the end. */
  def rest: ByteReader = if (offset < bytes.length) new ByteReader(bytes, offset + 1) else this

  def pos: Position = ByteOffsetPosition(offset)

  def atEnd: Boolean = offset >= bytes.length

  /** Byte at absolute index `n` of the underlying array (not relative to `offset`). */
  def byteAt(n: Int): Byte = bytes(n)

  /** Number of bytes remaining; never negative, even if `offset` is past the end. */
  def length: Int = math.max(0, bytes.length - offset)

  /** Advances by `n` bytes without allocating a reader per step.
    *
    * Mirrors repeated application of [[rest]]: a non-positive `n` leaves the
    * reader where it is, and advancing stops at the end of the input.
    */
  override def drop(n: Int): ByteReader =
    if (n <= 0) this
    else new ByteReader(bytes, offset + math.min(n, length))

  /** Up to `n` bytes starting at the current position (fewer if the input is shorter). */
  def take(n: Int): Seq[Byte] = {
    // slice copies only the requested window instead of the whole remainder
    val from  = math.max(offset, 0)
    val count = math.max(0, math.min(n, bytes.length - from))
    bytes.slice(from, from + count).toSeq
  }

  override def toString = "ByteReader(%d / %d)".format(offset, bytes.length)
}
/** Parsers whose element type is `Byte`, with helpers for fixed-width integers
  * and floating-point values.
  */
trait BinaryParsers extends Parsers with ParsersUtil {
  type Elem = Byte

  /** Lets parser bodies call `ByteReader` members (`length`, `take`) on any `Input`.
    * NOTE(review): the non-`ByteReader` branch goes through the `Reader[_]`
    * constructor, which restarts at offset 0 — confirm that is intended.
    */
  protected implicit def readerToByteReader(x: Input): ByteReader = x match {
    case br: ByteReader => br
    case _              => new ByteReader(x)
  }

  /** Accepts one byte satisfying `p`, failing at end of input.
    *
    * The explicit `atEnd` check is needed because `ByteReader.first` returns a
    * sentinel byte at the end of the input; without it that sentinel would be
    * accepted as data and repetition parsers would never stop.
    */
  override def acceptIf(p: Elem => Boolean)(err: Elem => String): Parser[Elem] = Parser { in =>
    if (in.atEnd) Failure("end of input", in)
    else {
      val head = in.first
      if (p(head)) Success(head, in.rest) else Failure(err(head), in)
    }
  }

  /** Accepts one byte on which `f` is defined, failing at end of input
    * (same reasoning as [[acceptIf]]).
    */
  override def acceptMatch[U](expected: String, f: PartialFunction[Elem, U]): Parser[U] = Parser { in =>
    if (in.atEnd) Failure("end of input", in)
    else {
      val head = in.first
      if (f.isDefinedAt(head)) Success(f(head), in.rest) else Failure(expected + " expected", in)
    }
  }

  /** Folds bytes into an `Int`, earlier bytes ending up more significant
    * (big-endian); each byte is treated as unsigned.
    */
  def toInt(bytes: Seq[Byte]): Int = bytes.foldLeft(0)((x, b) => (x << 8) + (b & 0xFF))

  /** Same as [[toInt]] but accumulating into a `Long`. */
  def toLong(bytes: Seq[Byte]): Long = bytes.foldLeft(0L)((x, b) => (x << 8) + (b & 0xFF))

  /** One raw (signed) byte. */
  lazy val byte: Parser[Byte] = anyElem
  /** One byte as an unsigned value 0-255. */
  lazy val u1: Parser[Int] = byte ^^ (_ & 0xFF)
  /** Two bytes, big-endian. */
  lazy val u2: Parser[Int] = bytes(2) ^^ toInt
  /** Four bytes, big-endian; the result is negative when the top bit is set. */
  lazy val u4: Parser[Int] = bytes(4) ^^ toInt
  /** Four bytes reinterpreted as a `Float` bit pattern. */
  lazy val u4f: Parser[Float] = u4 ^^ intBitsToFloat
  /** Eight bytes, big-endian; the result is negative when the top bit is set. */
  lazy val u8: Parser[Long] = bytes(8) ^^ toLong
  /** Eight bytes reinterpreted as a `Double` bit pattern. */
  lazy val u8d: Parser[Double] = u8 ^^ longBitsToDouble

  /** Consumes exactly `n` bytes, failing (without consuming) when fewer remain
    * or when `n` is negative.
    */
  def bytes(n: Int): Parser[Seq[Byte]] = Parser { in =>
    // A negative count would otherwise pass the length check below.
    if (n < 0) Failure("Requested a negative number of bytes: %d".format(n), in)
    else if (n <= in.length) Success(in take n, in drop n)
    else Failure("Requested %d bytes but only %d remain".format(n, in.length), in)
  }

  /** Runs `p` on `in`. */
  def parse[T](p: Parser[T], in: Input): ParseResult[T] = p(in)

  /** Runs `p` on the bytes of `in` (encoded as done by `ByteReader`'s String constructor). */
  def parse[T](p: Parser[T], in: String): ParseResult[T] = parse(p, new ByteReader(in))
}
--
Paul Phillips | Before a man speaks it is always safe to assume
Everyman | that he is a fool. After he speaks, it is seldom
Empiricist | necessary to assume it.
slap pi uphill! | -- H. L. Mencken
Wed, 2010-03-17, 15:57
#6
Re: Parsing byte sequences
Sorry for commenting on an old thread, but BinaryParsers looks really nice. Hope to see it in trunk some day!
I guess the final version will also have a way to specify little/big endianness, is that right?
Thanks, Mushtaq
On Tue, Dec 29, 2009 at 9:37 PM, Paul Phillips <paulp [at] improving [dot] org> wrote:
I guess the final version will also have a way to specify little/big endianness, is that right?
Thanks, Mushtaq
On Tue, Dec 29, 2009 at 9:37 PM, Paul Phillips <paulp [at] improving [dot] org> wrote:
On Wed, Dec 30, 2009 at 12:36:07AM +1100, John Ky wrote:
> Anyone know how to use the parser combinators on byte arrays and
> InputStreams? I want to write a parser that will parse bytes rather
> than characters.
I happen to be working on a very entertaining project involving that at
this very moment. I'll publish this all pretty soon but here are a few
classes I use to make the standard library seem more byte oriented.
(Some of this is specific to my project.)
import scala.util.parsing.combinator._
import scala.util.parsing.input.{ Position, Reader }
import scala.util.parsing.input.CharArrayReader.EofCh
import scala.annotation.tailrec
import java.lang.Float.intBitsToFloat
import java.lang.Double.longBitsToDouble
/** Element-level helper parsers that work for any `Elem` type. */
trait ParsersUtil extends Parsers {
  /** Accepts whatever single element comes next. */
  lazy val anyElem: Parser[Elem] = elem("anyElem", (_: Elem) => true)

  /** Accepts one element provided it is not listed in `xs`. */
  def elemExcept(xs: Elem*): Parser[Elem] =
    elem("elemExcept", (candidate: Elem) => !xs.contains(candidate))

  /** Accepts one element provided it is listed in `xs`. */
  def elemOf(xs: Elem*): Parser[Elem] =
    elem("elemOf", (candidate: Elem) => xs.contains(candidate))

  /** Parses `n` consecutive elements using [[anyElem]]. */
  def take(n: Int): Parser[Seq[Elem]] =
    repN(n, anyElem)

  /** Collects elements for as long as `cond` does not match at the current position. */
  def takeUntil(cond: Parser[Elem]): Parser[Seq[Elem]] =
    takeUntil(cond, anyElem)

  /** Repeatedly applies `p`, stopping as soon as `cond` would match (or `p` fails). */
  def takeUntil(cond: Parser[Elem], p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(not(cond) ~> p)

  /** Applies `p` zero or more times and collects the results. */
  def takeWhile(p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(p)
}
/** A [[Position]] for byte input: everything is reported on line 1 and the
  * column is the 1-based form of the 0-based `offset`.
  */
case class ByteOffsetPosition(offset: Int) extends Position {
  /** Always the empty string; no textual line is exposed for byte input. */
  def lineContents: String = ""

  /** Always 1. */
  final val line = 1

  /** `offset` shifted to be 1-based. */
  def column = 1 + offset
}
/** A `Reader[Byte]` over a byte array, positioned at `offset`.
  *
  * `rest` and `drop` create new readers that reuse the same `bytes` array.
  *
  * @param bytes  the complete input buffer
  * @param offset index in `bytes` of the current element
  */
class ByteReader(val bytes: Array[Byte], override val offset: Int) extends Reader[Byte] {
  // NOTE(review): both getBytes calls below use the platform default charset, so
  // the produced bytes can differ between machines — confirm the intended encoding.

  /** Reads the bytes of `reader.source.toString`, starting at offset 0
    * (the current position of `reader` is not carried over).
    */
  def this(reader: Reader[_]) = this(reader.source.toString.getBytes, 0)

  /** Copies `bytes` into an array and starts at offset 0. */
  def this(bytes: Seq[Byte]) = this(bytes.toArray, 0)

  /** Starts at offset 0 over the encoded bytes of `str`. */
  def this(str: String) = this(str.getBytes, 0)

  /** One char per byte. Bytes are masked with 0xFF so that values >= 0x80 map
    * to chars 0x80-0xFF instead of being sign-extended.
    */
  override def source: CharSequence = new String(bytes.map(b => (b & 0xFF).toChar))

  /** The current byte. At end of input this returns `EofCh.toByte`, which cannot
    * be told apart from a real byte of that value — check [[atEnd]] first.
    */
  def first: Byte = if (offset < bytes.length) bytes(offset) else EofCh.toByte

  /** Reader advanced by one byte, or this reader when already at the end. */
  def rest: ByteReader = if (offset < bytes.length) new ByteReader(bytes, offset + 1) else this

  def pos: Position = ByteOffsetPosition(offset)

  def atEnd: Boolean = offset >= bytes.length

  /** Byte at absolute index `n` of the underlying array (not relative to `offset`). */
  def byteAt(n: Int): Byte = bytes(n)

  /** Number of bytes remaining; never negative, even if `offset` is past the end. */
  def length: Int = math.max(0, bytes.length - offset)

  /** Advances by `n` bytes without allocating a reader per step.
    *
    * Mirrors repeated application of [[rest]]: a non-positive `n` leaves the
    * reader where it is, and advancing stops at the end of the input.
    */
  override def drop(n: Int): ByteReader =
    if (n <= 0) this
    else new ByteReader(bytes, offset + math.min(n, length))

  /** Up to `n` bytes starting at the current position (fewer if the input is shorter). */
  def take(n: Int): Seq[Byte] = {
    // slice copies only the requested window instead of the whole remainder
    val from  = math.max(offset, 0)
    val count = math.max(0, math.min(n, bytes.length - from))
    bytes.slice(from, from + count).toSeq
  }

  override def toString = "ByteReader(%d / %d)".format(offset, bytes.length)
}
/** Parsers whose element type is `Byte`, with helpers for fixed-width integers
  * and floating-point values.
  */
trait BinaryParsers extends Parsers with ParsersUtil {
  type Elem = Byte

  /** Lets parser bodies call `ByteReader` members (`length`, `take`) on any `Input`.
    * NOTE(review): the non-`ByteReader` branch goes through the `Reader[_]`
    * constructor, which restarts at offset 0 — confirm that is intended.
    */
  protected implicit def readerToByteReader(x: Input): ByteReader = x match {
    case br: ByteReader => br
    case _              => new ByteReader(x)
  }

  /** Accepts one byte satisfying `p`, failing at end of input.
    *
    * The explicit `atEnd` check is needed because `ByteReader.first` returns a
    * sentinel byte at the end of the input; without it that sentinel would be
    * accepted as data and repetition parsers would never stop.
    */
  override def acceptIf(p: Elem => Boolean)(err: Elem => String): Parser[Elem] = Parser { in =>
    if (in.atEnd) Failure("end of input", in)
    else {
      val head = in.first
      if (p(head)) Success(head, in.rest) else Failure(err(head), in)
    }
  }

  /** Accepts one byte on which `f` is defined, failing at end of input
    * (same reasoning as [[acceptIf]]).
    */
  override def acceptMatch[U](expected: String, f: PartialFunction[Elem, U]): Parser[U] = Parser { in =>
    if (in.atEnd) Failure("end of input", in)
    else {
      val head = in.first
      if (f.isDefinedAt(head)) Success(f(head), in.rest) else Failure(expected + " expected", in)
    }
  }

  /** Folds bytes into an `Int`, earlier bytes ending up more significant
    * (big-endian); each byte is treated as unsigned.
    */
  def toInt(bytes: Seq[Byte]): Int = bytes.foldLeft(0)((x, b) => (x << 8) + (b & 0xFF))

  /** Same as [[toInt]] but accumulating into a `Long`. */
  def toLong(bytes: Seq[Byte]): Long = bytes.foldLeft(0L)((x, b) => (x << 8) + (b & 0xFF))

  /** One raw (signed) byte. */
  lazy val byte: Parser[Byte] = anyElem
  /** One byte as an unsigned value 0-255. */
  lazy val u1: Parser[Int] = byte ^^ (_ & 0xFF)
  /** Two bytes, big-endian. */
  lazy val u2: Parser[Int] = bytes(2) ^^ toInt
  /** Four bytes, big-endian; the result is negative when the top bit is set. */
  lazy val u4: Parser[Int] = bytes(4) ^^ toInt
  /** Four bytes reinterpreted as a `Float` bit pattern. */
  lazy val u4f: Parser[Float] = u4 ^^ intBitsToFloat
  /** Eight bytes, big-endian; the result is negative when the top bit is set. */
  lazy val u8: Parser[Long] = bytes(8) ^^ toLong
  /** Eight bytes reinterpreted as a `Double` bit pattern. */
  lazy val u8d: Parser[Double] = u8 ^^ longBitsToDouble

  /** Consumes exactly `n` bytes, failing (without consuming) when fewer remain
    * or when `n` is negative.
    */
  def bytes(n: Int): Parser[Seq[Byte]] = Parser { in =>
    // A negative count would otherwise pass the length check below.
    if (n < 0) Failure("Requested a negative number of bytes: %d".format(n), in)
    else if (n <= in.length) Success(in take n, in drop n)
    else Failure("Requested %d bytes but only %d remain".format(n, in.length), in)
  }

  /** Runs `p` on `in`. */
  def parse[T](p: Parser[T], in: Input): ParseResult[T] = p(in)

  /** Runs `p` on the bytes of `in` (encoded as done by `ByteReader`'s String constructor). */
  def parse[T](p: Parser[T], in: String): ParseResult[T] = parse(p, new ByteReader(in))
}
--
Paul Phillips | Before a man speaks it is always safe to assume
Everyman | that he is a fool. After he speaks, it is seldom
Empiricist | necessary to assume it.
slap pi uphill! | -- H. L. Mencken
On Tue, Dec 29, 2009 at 11:36 AM, John Ky <newhoggy [at] gmail [dot] com> wrote:
--
Daniel C. Sobral
I travel to the future all the time.