org.xml.sax.InputSource Scala Examples
The following examples show how to use org.xml.sax.InputSource.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: PmmlLoaderKit.scala From flink-jpmml with GNU Affero General Public License v3.0 | 5 votes |
package io.radicalbit.flink.pmml.scala.utils

import org.dmg.pmml.PMML
import org.jpmml.model.{ImportFilter, JAXBUtil}
import org.xml.sax.InputSource

/**
 * Mixin exposing classpath locations of PMML fixture files together with
 * helpers that resolve or load them.
 */
trait PmmlLoaderKit {

  /** Classpath-relative locations of the PMML fixture files. */
  protected case object Source {
    val KmeansPmml = "/kmeans.xml"
    val KmeansPmml41 = "/kmeans41.xml"
    val KmeansPmml40 = "/kmeans40.xml"
    val KmeansPmml42 = "/kmeans42.xml"
    // NOTE(review): this resolves to the same file as KmeansPmml41 — confirm whether a
    // dedicated 3.2 fixture was intended before changing it.
    val KmeansPmml32 = "/kmeans41.xml"
    val KmeansPmmlEmpty = "/kmeans_empty.xml"
    val KmeansPmmlNoOut = "/kmeans_nooutput.xml"
    val KmeansPmmlStringFields = "/kmeans_stringfields.xml"
    val KmeansPmmlNoOutNoTrg = "/kmeans_nooutput_notarget.xml"
    // Fixed prefix followed by four random characters, generated once per instance.
    val NotExistingPath: String = "/not/existing/" + scala.util.Random.nextString(4)
  }

  /**
   * Resolves a classpath resource and returns the path component of its URL.
   *
   * @param path classpath-relative resource name (e.g. one of the [[Source]] constants)
   * @return the path of the resolved resource URL
   * @throws NullPointerException if the resource cannot be found (`getResource` returns null)
   */
  final protected def getPMMLSource(path: String): String =
    getClass.getResource(path).getPath

  /**
   * Loads a classpath resource and unmarshals it into a [[PMML]] model.
   *
   * The underlying stream is closed once unmarshalling has finished (or failed);
   * previously it was left open.
   *
   * @param path classpath-relative resource name
   * @return the unmarshalled PMML document
   */
  final protected def getPMMLResource(path: String): PMML = {
    val resource = scala.io.Source.fromURL(getClass.getResource(path))
    // The import filter reads lazily, so the reader must stay open until
    // unmarshalling completes; close it only afterwards.
    try JAXBUtil.unmarshalPMML(ImportFilter.apply(new InputSource(resource.reader())))
    finally resource.close()
  }
}
Example 2
Source File: AmqpXPathCheckMaterializer.scala From gatling-amqp-plugin with Apache License 2.0 | 5 votes |
package ru.tinkoff.gatling.amqp.checks

import java.io.{ByteArrayInputStream, InputStreamReader}

import io.gatling.commons.validation.{safely, _}
import io.gatling.core.check.xpath.{Dom, XPathCheckType, XmlParsers}
import io.gatling.core.check.{CheckMaterializer, Preparer}
import org.xml.sax.InputSource
import ru.tinkoff.gatling.amqp.AmqpCheck
import ru.tinkoff.gatling.amqp.request.AmqpProtocolMessage

/**
 * Check materializer that prepares an AMQP message for XPath checks by parsing
 * its payload into a DOM.
 *
 * @param xmlParsers parser facade used to turn the payload into a [[Dom]]
 */
class AmqpXPathCheckMaterializer(xmlParsers: XmlParsers)
    extends CheckMaterializer[XPathCheckType, AmqpCheck, AmqpProtocolMessage, Option[Dom]](identity) {

  // Prefixes any parsing error message reported through `safely`.
  private val ErrorMapper = "Could not parse response into a DOM Document: " + _

  /**
   * Parses the message payload into a DOM wrapped in `Some`; any exception thrown
   * while parsing is converted into a failure whose message is built by `ErrorMapper`.
   */
  override protected def preparer: Preparer[AmqpProtocolMessage, Option[Dom]] =
    message =>
      safely(ErrorMapper) {
        message match {
          case AmqpProtocolMessage(_, payload, _) =>
            // Hand the raw bytes to the parser instead of decoding them with the
            // platform default charset: with a byte stream the XML parser detects
            // the encoding itself (BOM / XML declaration), whereas a character
            // stream makes it ignore the declared encoding.
            val in = new ByteArrayInputStream(payload)
            Some(xmlParsers.parse(new InputSource(in))).success
          case _ => "Unsupported message type".failure
        }
      }
}
Example 3
Source File: EventDrivenFetcher.scala From Mycat-spider with Apache License 2.0 | 5 votes |
package turbo.crawler.power

import java.io.IOException
import java.io.StringReader
import java.net.BindException
import java.net.SocketException
import java.net.SocketTimeoutException
import org.apache.commons.httpclient.ConnectTimeoutException
import org.apache.commons.httpclient.Header
import org.cyberneko.html.parsers.DOMParser
import org.w3c.dom.Document
import org.xml.sax.InputSource
import turbo.crawler.FetchRejectedException
import turbo.crawler.Fetchable
import turbo.crawler.IO
import turbo.crawler.Logable
import turbo.crawler.ResourceHasAlreadyBeenFetchedException
import turbo.crawler.StringAdapter
import turbo.crawler.io.HttpReturns

/**
 * Event driven fetcher.
 *
 * Downloads a page, parses it into a DOM and fires one event per item extracted
 * from that DOM. The event id is the `eventId` constructor argument suffixed
 * with "_COMPLETION".
 *
 * @author mclaren
 *
 */
class EventDrivenFetcher[T <: Fetchable](eventId: String) extends Logable with MessageDriven with IO with StringAdapter {
  /**
   * Fetches `fetchUrl`, filters the response body through `contentFilter`, parses
   * the result with the cyberneko `DOMParser`, and fires a completion event for
   * every element returned by `parseDocument`.
   *
   * If `hasRejected` returns true for the parsed document a
   * `FetchRejectedException` is thrown and handled below. Network-type failures
   * are handled by logging, sleeping three seconds and calling `fetch` again.
   *
   * NOTE(review): the body uses `howToContinue` and `referer`, neither of which is
   * declared in the visible signature, and the method is called recursively
   * without a declared result type. Presumably further parameter lists (and a
   * result type) were lost from this copy — confirm against the original file.
   *
   * @param fetchUrl      URL to download
   * @param contentFilter transformation applied to the response body before parsing
   * @param parseDocument extracts the items to publish from the parsed document
   * @param hasRejected   predicate telling whether the parsed page indicates a rejection
   */
  def fetch(fetchUrl: String , contentFilter: String => String , parseDocument: Document => List[T])(hasRejected: Document => Boolean) = {
    // Logs, waits three seconds, then re-invokes fetch with the same arguments.
    // NOTE(review): there is no retry limit visible here; each retry nests one call deeper.
    val _retry = (msg: String) => {
      logger.info("Retry " + msg)
      Thread.sleep(3000)
      this.fetch(fetchUrl, contentFilter, parseDocument)(hasRejected)(howToContinue)(referer)
    }
    // Declared outside the try so the catch handlers can read the proxy that was used.
    // It is still null if `fromUrl` itself throws.
    var httpReturns: HttpReturns = null
    try {
      val dom = new DOMParser
      httpReturns = this.fromUrl(fetchUrl, Array[Header](new Header("Referer", referer(fetchUrl))))
      dom.parse(new InputSource(new StringReader(contentFilter(httpReturns.body))))
      var document = dom.getDocument
      // Check whether we have been blocked
      if (hasRejected(document)) throw new FetchRejectedException(fetchUrl)
      parseDocument(document).foreach(x => fireEvent(new Evt(eventId + "_COMPLETION", x)))
    } catch {
      case e: SocketTimeoutException => _retry(e.getMessage)
      case e: SocketException => _retry(e.getMessage)
      case e: ConnectTimeoutException => _retry(e.getMessage)
      case e: IOException => {
        // NOTE(review): if the IOException came from `fromUrl`, `httpReturns` is still
        // null here and the next line throws NullPointerException — confirm and guard.
        logger.info("Oh网络错误with代理:" + httpReturns.proxy.ip + ":" + httpReturns.proxy.port)
        howToContinue(fetchUrl, httpReturns.proxy)
        // Only one redial is allowed within 10 seconds
        _retry(e.getMessage)
      }
      // NOTE(review): java.net.BindException is a subclass of SocketException, so the
      // SocketException case above already matches it and this case is never reached.
      case e: BindException => _retry(e.getMessage)
      case e: FetchRejectedException => {
        logger.info("Oh 惨遭屏蔽~")
        howToContinue(e.getFetchUrl, httpReturns.proxy)
        // Only one redial is allowed within 10 seconds
        _retry(e.getMessage)
      }
      // Deliberately ignored: nothing is done when the resource was already fetched.
      case e: ResourceHasAlreadyBeenFetchedException =>
      case e: Exception => {
        logger.error("Unknown exception has been occurred", e)
      }
    }
  }
}
Example 4
Source File: Pagination.scala From Mycat-spider with Apache License 2.0 | 5 votes |
package turbo.crawler.power

import java.io.IOException
import java.io.StringReader
import java.net.SocketException
import java.net.SocketTimeoutException
import org.apache.commons.httpclient.ConnectTimeoutException
import org.apache.commons.httpclient.Header
import org.cyberneko.html.parsers.DOMParser
import org.w3c.dom.Document
import org.xml.sax.InputSource
import turbo.crawler.FetchRejectedException
import turbo.crawler.Logable
import turbo.crawler.ResourceHasAlreadyBeenFetchedException
import turbo.crawler.io.HttpReturns
import turbo.crawler.io.InternetIO

/**
 * Pagination support.
 *
 * Determines how many pages a listing has and produces one URL per page.
 *
 * @author mclaren
 *
 */
object pages extends Logable with InternetIO {
  /**
   * Builds the list of page URLs for `fetchUrl`.
   *
   * The page count is obtained by `resetBoundary`, which stores it in a
   * [[ValueRef]]; `urlFactory` is then called for each index from 1 up to that
   * count. Each URL is prepended, so the returned list runs from the highest
   * page index down to 1.
   *
   * @param fetchUrl      URL of the page used to discover the page count
   * @param contentFilter transformation applied to the response body before parsing
   * @param checkBoundary extracts the page count from the parsed document
   * @param urlFactory    builds the URL of a page from `fetchUrl` and the page index
   * @param hasRejected   predicate telling whether the parsed page indicates a rejection
   * @param howToContinue callback invoked with the URL and proxy after a rejection
   * @return the generated page URLs; empty when the page count is 0
   */
  def apply(fetchUrl: String, contentFilter: String => String, checkBoundary: Document => Int, urlFactory: (String, Int) => String)(hasRejected: Document => Boolean)(howToContinue: (String, turbo.crawler.io.Proxy) => Unit): List[String] = {
    // Mutable holder filled in by resetBoundary; stays 0 if no boundary could be read.
    var value = new ValueRef[Int](0)
    resetBoundary(fetchUrl, value, contentFilter, checkBoundary, urlFactory)(hasRejected)(howToContinue)
    var rts = List[String]()
    // NOTE(review): the result of this expression is discarded; it looks like a leftover.
    value.get
    for (i <- 1 to value.get) {
      // `+:` prepends, which is why the final list is in descending page order.
      rts = rts.+:(urlFactory(fetchUrl, i))
    }
    rts
  }

  /**
   * Fetches `fetchUrl`, parses it and stores `checkBoundary(document)` in `lastPage`.
   *
   * Network-type failures and rejections are handled by sleeping three seconds and
   * calling `resetBoundary` again with the same arguments.
   *
   * NOTE(review): the body uses `howToContinue`, which is not declared in the visible
   * signature although `apply` passes it as a third argument list. Presumably that
   * parameter list was lost from this copy — confirm against the original file.
   */
  private def resetBoundary(fetchUrl: String, lastPage: ValueRef[Int], contentFilter: String => String = x => x, checkBoundary: Document => Int, urlFactory: (String, Int) => String)(hasRejected: Document => Boolean ): Unit = {
    // Waits three seconds, then retries with identical arguments.
    // NOTE(review): there is no retry limit visible here.
    val _retry = (() => {
      Thread.sleep(3000)
      resetBoundary(fetchUrl, lastPage, contentFilter, checkBoundary, urlFactory)(hasRejected)(howToContinue)
    })
    // Declared outside the try so the rejection handler can read the proxy that was used.
    var httpReturns: HttpReturns = null
    try {
      var domp = new DOMParser
      httpReturns = this.fromUrl(fetchUrl, Array[Header]())
      domp.parse(new InputSource(new StringReader(contentFilter(httpReturns.body))))
      var document = domp.getDocument
      if (hasRejected(document)) throw new FetchRejectedException(fetchUrl, httpReturns.proxy)
      lastPage.set(checkBoundary(document))
    } catch {
      case e: SocketTimeoutException => _retry()
      case e: SocketException => _retry()
      case e: ConnectTimeoutException => _retry()
      case e: IOException => _retry()
      case e: FetchRejectedException => {
        logger.info("Oh 惨遭屏蔽~")
        howToContinue(e.getFetchUrl, httpReturns.proxy)
        _retry()
      }
      // Deliberately ignored: nothing is done when the resource was already fetched.
      case e: ResourceHasAlreadyBeenFetchedException =>
      case e: Exception => {
        logger.error("Unknown exception has been occurred", e)
      }
    }
  }
}

/**
 * Minimal mutable holder for a single value, used above to hand a result back
 * from `resetBoundary`.
 *
 * @param v initial value
 */
class ValueRef[M](v: M) {
  var value = v
  // Replaces the held value.
  def set(vv: M) = this.value = vv
  // Returns the currently held value.
  def get = value
}
Example 5
Source File: PMMLUtils.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.util

import java.io.StringReader

import org.dmg.pmml._
import org.jpmml.model.{ImportFilter, JAXBUtil}
import org.xml.sax.InputSource

/**
 * Helpers for working with PMML in tests.
 * PMML (Predictive Model Markup Language) is an XML-based file format
 * developed by the Data Mining Group (www.dmg.org).
 */
object PMMLUtils {

  /**
   * :: Experimental ::
   * Parses PMML text into its object model. Intended for testing only; PMML model
   * evaluation is supported through external spark-packages.
   *
   * @param input the PMML document as a string
   * @return the unmarshalled [[PMML]] model
   */
  def loadFromString(input: String): PMML = {
    // Wrap the text as a SAX source, run it through the import filter, then unmarshal.
    val xmlSource = new InputSource(new StringReader(input))
    JAXBUtil.unmarshalPMML(ImportFilter.apply(xmlSource))
  }
}