org.xml.sax.InputSource Scala Example

Source File: PmmlLoaderKit.scala From flink-jpmml with GNU Affero General Public License v3.0

5 votes

package io.radicalbit.flink.pmml.scala.utils

import org.dmg.pmml.PMML
import org.jpmml.model.{ImportFilter, JAXBUtil}
import org.xml.sax.InputSource

trait PmmlLoaderKit {

  protected case object Source {
    val KmeansPmml = "/kmeans.xml"
    val KmeansPmml41 = "/kmeans41.xml"
    val KmeansPmml40 = "/kmeans40.xml"
    val KmeansPmml42 = "/kmeans42.xml"
    val KmeansPmml32 = "/kmeans41.xml"

    val KmeansPmmlEmpty = "/kmeans_empty.xml"
    val KmeansPmmlNoOut = "/kmeans_nooutput.xml"
    val KmeansPmmlStringFields = "/kmeans_stringfields.xml"
    val KmeansPmmlNoOutNoTrg = "/kmeans_nooutput_notarget.xml"
    val NotExistingPath: String = "/not/existing/" + scala.util.Random.nextString(4)
  }

  final protected def getPMMLSource(path: String): String =
    getClass.getResource(path).getPath

  final protected def getPMMLResource(path: String): PMML = {
    val source = scala.io.Source.fromURL(getClass.getResource(path)).reader()
    JAXBUtil.unmarshalPMML(ImportFilter.apply(new InputSource(source)))
  }

}

Source File: AmqpXPathCheckMaterializer.scala From gatling-amqp-plugin with Apache License 2.0

5 votes

package ru.tinkoff.gatling.amqp.checks

import java.io.{ByteArrayInputStream, InputStreamReader}

import io.gatling.commons.validation.{safely, _}
import io.gatling.core.check.xpath.{Dom, XPathCheckType, XmlParsers}
import io.gatling.core.check.{CheckMaterializer, Preparer}
import org.xml.sax.InputSource
import ru.tinkoff.gatling.amqp.AmqpCheck
import ru.tinkoff.gatling.amqp.request.AmqpProtocolMessage

class AmqpXPathCheckMaterializer(xmlParsers: XmlParsers)
    extends CheckMaterializer[XPathCheckType, AmqpCheck, AmqpProtocolMessage, Option[Dom]](identity) {
  private val ErrorMapper = "Could not parse response into a DOM Document: " + _

  override protected def preparer: Preparer[AmqpProtocolMessage, Option[Dom]] =
    message =>
      safely(ErrorMapper) {
        message match {
          case AmqpProtocolMessage(_, payload, _) =>
            val in = new ByteArrayInputStream(payload)
            Some(xmlParsers.parse(new InputSource(new InputStreamReader(in)))).success
          case _ => "Unsupported message type".failure
        }
      }
}

Source File: EventDrivenFetcher.scala From Mycat-spider with Apache License 2.0

5 votes

package turbo.crawler.power

import java.io.IOException
import java.io.StringReader
import java.net.BindException
import java.net.SocketException
import java.net.SocketTimeoutException

import org.apache.commons.httpclient.ConnectTimeoutException
import org.apache.commons.httpclient.Header
import org.cyberneko.html.parsers.DOMParser
import org.w3c.dom.Document
import org.xml.sax.InputSource

import turbo.crawler.FetchRejectedException
import turbo.crawler.Fetchable
import turbo.crawler.IO
import turbo.crawler.Logable
import turbo.crawler.ResourceHasAlreadyBeenFetchedException
import turbo.crawler.StringAdapter
import turbo.crawler.io.HttpReturns

/**
 * Event driven fetcher
 * @author mclaren
 *
 */
class EventDrivenFetcher[T <: Fetchable](eventId: String) extends Logable with MessageDriven with IO with StringAdapter {
  def fetch(fetchUrl: String  ,
    contentFilter: String => String  ,
    parseDocument: Document => List[T])(hasRejected: Document => Boolean)  = {
    val _retry = (msg: String) => {
      logger.info("Retry " + msg)
      Thread.sleep(3000)
      this.fetch(fetchUrl, contentFilter, parseDocument)(hasRejected)(howToContinue)(referer)
    }
    var httpReturns: HttpReturns = null
    try {
      val dom = new DOMParser
      httpReturns = this.fromUrl(fetchUrl, Array[Header](new Header("Referer", referer(fetchUrl))))
      dom.parse(new InputSource(new StringReader(contentFilter(httpReturns.body))))
      var document = dom.getDocument

      //检查是否被屏蔽
      if (hasRejected(document)) throw new FetchRejectedException(fetchUrl)

      parseDocument(document).foreach(x => fireEvent(new Evt(eventId + "_COMPLETION", x)))

    } catch {
      case e: SocketTimeoutException => _retry(e.getMessage)
      case e: SocketException => _retry(e.getMessage)
      case e: ConnectTimeoutException => _retry(e.getMessage)
      case e: IOException => {
        logger.info("Oh网络错误with代理：" + httpReturns.proxy.ip + ":" + httpReturns.proxy.port)
        howToContinue(fetchUrl, httpReturns.proxy)
        //10秒之内只允许出现一次重拨
        _retry(e.getMessage)
      }
      case e: BindException => _retry(e.getMessage)
      case e: FetchRejectedException => {
        logger.info("Oh 惨遭屏蔽~")
        howToContinue(e.getFetchUrl, httpReturns.proxy)
        //10秒之内只允许出现一次重拨
        _retry(e.getMessage)
      }
      case e: ResourceHasAlreadyBeenFetchedException =>
      case e: Exception => {
        logger.error("Unknown exception has been occurred", e)
      }
    }
  }
}

Source File: Pagination.scala From Mycat-spider with Apache License 2.0

5 votes

package turbo.crawler.power

import java.io.IOException
import java.io.StringReader
import java.net.SocketException
import java.net.SocketTimeoutException

import org.apache.commons.httpclient.ConnectTimeoutException
import org.apache.commons.httpclient.Header
import org.cyberneko.html.parsers.DOMParser
import org.w3c.dom.Document
import org.xml.sax.InputSource

import turbo.crawler.FetchRejectedException
import turbo.crawler.Logable
import turbo.crawler.ResourceHasAlreadyBeenFetchedException
import turbo.crawler.io.HttpReturns
import turbo.crawler.io.InternetIO

/**
 * 分页支持
 * @author mclaren
 *
 */
object pages extends Logable with InternetIO {
  def apply(fetchUrl: String, contentFilter: String => String, checkBoundary: Document => Int, urlFactory: (String, Int) => String)(hasRejected: Document => Boolean)(howToContinue: (String, turbo.crawler.io.Proxy) => Unit): List[String] = {
    var value = new ValueRef[Int](0)
    resetBoundary(fetchUrl, value, contentFilter, checkBoundary, urlFactory)(hasRejected)(howToContinue)
    var rts = List[String]()
    value.get
    for (i <- 1 to value.get) {
      rts = rts.+:(urlFactory(fetchUrl, i))
    }
    rts
  }

  private def resetBoundary(fetchUrl: String, lastPage: ValueRef[Int], contentFilter: String => String = x => x, checkBoundary: Document => Int, urlFactory: (String, Int) => String)(hasRejected: Document => Boolean  ): Unit = {
    val _retry = (() => {
      Thread.sleep(3000)
      resetBoundary(fetchUrl, lastPage, contentFilter, checkBoundary, urlFactory)(hasRejected)(howToContinue)
    })

    var httpReturns: HttpReturns = null
    try {
      var domp = new DOMParser
      httpReturns = this.fromUrl(fetchUrl, Array[Header]())
      domp.parse(new InputSource(new StringReader(contentFilter(httpReturns.body))))
      var document = domp.getDocument

      if (hasRejected(document)) throw new FetchRejectedException(fetchUrl, httpReturns.proxy)

      lastPage.set(checkBoundary(document))
    } catch {
      case e: SocketTimeoutException => _retry()
      case e: SocketException => _retry()
      case e: ConnectTimeoutException => _retry()
      case e: IOException => _retry()
      case e: FetchRejectedException => {
        logger.info("Oh 惨遭屏蔽~")
        howToContinue(e.getFetchUrl, httpReturns.proxy)
        _retry()
      }
      case e: ResourceHasAlreadyBeenFetchedException =>
      case e: Exception => {
        logger.error("Unknown exception has been occurred", e)
      }
    }
  }
}
class ValueRef[M](v: M) {
  var value = v
  def set(vv: M) = this.value = vv
  def get = value
}

Source File: PMMLUtils.scala From sona with Apache License 2.0

5 votes

package com.tencent.angel.sona.ml.util

import java.io.StringReader

import org.dmg.pmml._
import org.jpmml.model.{ImportFilter, JAXBUtil}
import org.xml.sax.InputSource

/**
 * Testing utils for working with PMML.
 * Predictive Model Markup Language (PMML) is an XML-based file format
 * developed by the Data Mining Group (www.dmg.org).
 */
object PMMLUtils {
  /**
   * :: Experimental ::
   * Load a PMML model from a string. Note: for testing only, PMML model evaluation is supported
   * through external spark-packages.
   */
  def loadFromString(input: String): PMML = {
    val is = new StringReader(input)
    val transformed = ImportFilter.apply(new InputSource(is))
    JAXBUtil.unmarshalPMML(transformed)
  }
}

org.xml.sax.InputSource Scala Examples