snippets/misc/scrape.scala
2024-11-02 20:34:57 +08:00

97 lines
3.2 KiB
Scala

//> using dep org.http4s::http4s-ember-client:1.0.0-M43
//> using dep org.typelevel::log4cats-slf4j:2.7.0
import cats.effect.*
import cats.effect.std.*
import cats.syntax.all.*
import org.http4s.ember.client.EmberClientBuilder
import org.http4s.client.Client
import org.http4s.client.middleware.FollowRedirect
import org.http4s.EntityDecoder
import fs2.*
import fs2.io.file.*
import org.typelevel.log4cats.*
import org.typelevel.log4cats.slf4j.Slf4jFactory
object Scrape extends IOApp.Simple {

  /** Root of the local directory tree that downloaded images are written into. */
  private val OutputDir = "/Users/jilen/Downloads/drip_help_img"

  given LoggerFactory[IO] = Slf4jFactory.create[IO]

  // IOApp.Simple declares `run: IO[Unit]`. The previous `.as(ExitCode.Success)`
  // produced an IO[ExitCode], which does not conform to that signature and
  // fails to compile; the exit code is handled by IOApp.Simple itself.
  def run: IO[Unit] = saveImages().compile.drain

  /** Crawls help.drip.im (up to depth 2) and downloads every referenced image. */
  private def saveImages(): Stream[IO, Unit] = {
    for {
      client     <- Stream.resource(EmberClientBuilder.default[IO].build)
      savedRef   <- Stream.eval(Ref.of[IO, Set[String]](Set.empty))
      scannedRef <- Stream.eval(Ref.of[IO, Set[String]](Set.empty))
      _ <- loopScrape(scannedRef, client, "http://help.drip.im/hc", 0)
        .evalMap { img =>
          // Best-effort: one failed download must not abort the whole crawl.
          downloadImage(savedRef, client, img)
            .handleErrorWith(e => IO.println(s"Error download ${img}, ${e}"))
        }
    } yield ()
  }

  /** Strips a trailing "/?query" suffix so the URL path maps to a clean file path. */
  def normalizeFilePath(p: String): String = {
    val regex = "/\\?[^/]*$"
    p.replaceAll(regex, "")
  }

  /**
   * Downloads `url` under [[OutputDir]], creating parent directories as needed.
   * URLs already recorded in `savedUrls` are skipped (only a notice is printed).
   * Follows up to 3 redirects when fetching the image body.
   */
  private def downloadImage(savedUrls: Ref[IO, Set[String]], client: Client[IO], url: String): IO[Unit] = {
    // Captures everything after the host as the relative file path.
    val FpRegex = "https?\\://[^/]+/(.*)$".r
    val fClient = FollowRedirect(maxRedirects = 3)(client)
    url match {
      case FpRegex(p) =>
        val fp = Path(s"$OutputDir/${normalizeFilePath(p)}")
        for {
          surls <- savedUrls.get
          saved = surls.contains(url)
          _ <- if (saved) IO.println(s":::${url} Already saved") else IO.println(s":::${url} saved to ${fp}")
          _ <- fp.parent.map(Files[IO].createDirectories).getOrElse(IO.unit).whenA(!saved)
          _ <- fClient.expect(url)(using EntityDecoder.binFile[IO](fp)).whenA(!saved)
          _ <- savedUrls.update(_ + url).whenA(!saved)
        } yield ()
      case _ =>
        // The original match was non-exhaustive and raised a MatchError here;
        // skip URLs that are not absolute http(s) URLs instead.
        IO.println(s":::Skip non-matching url ${url}")
    }
  }

  /**
   * Recursively scrapes pages starting at `rootUrl`, emitting image URLs.
   * `scannedRef` de-duplicates already-visited pages; link recursion stops
   * once `depth` reaches 2 (images of the current page are still emitted).
   */
  private def loopScrape(scannedRef: Ref[IO, Set[String]],
      client: Client[IO], rootUrl: String, depth: Int): Stream[IO, String] = {
    Stream.eval(scannedRef.get).flatMap { scanned =>
      if (scanned.contains(rootUrl)) {
        Stream.empty
      } else {
        Stream.eval(scannedRef.update(_ + rootUrl)) >>
          Stream.eval {
            // Best-effort fetch: treat an unreachable page as empty, but log the
            // failure (the original `handleError(o => "")` swallowed it silently).
            client.expect[String](rootUrl)
              .handleErrorWith(e => IO.println(s"Error fetching ${rootUrl}, ${e}").as(""))
          }.flatMap { page =>
            val images = extractImages(page)
            val remain =
              if (depth >= 2) Stream.empty
              else Stream.emits(extractLinks(page)).flatMap(l => loopScrape(scannedRef, client, l, depth + 1))
            Stream.emits(images) ++ remain
          }
      }
    }
  }

  /**
   * Extracts `img src="..."` URLs from raw HTML, dropping static-asset hosts
   * (wx-static.drip.im, qiniu.drip.im) that are not help-center content.
   */
  private def extractImages(page: String): Seq[String] = {
    val regex = "img\\s*src\\s*=\\s*\"(\\S+)\"".r
    regex.findAllMatchIn(page)
      .map(_.group(1))
      .toSeq
      .filterNot(s => s.contains("wx-static.drip.im") || s.contains("qiniu.drip.im"))
  }

  /**
   * Extracts `/hc/...` hrefs from raw HTML, resolving relative links against
   * http://help.drip.im and keeping only help-center URLs.
   */
  private def extractLinks(page: String): Seq[String] = {
    val regex = "href[^\\>]*(?:help\\.drip\\.im)?(/hc/[\\w/]+)".r
    regex.findAllMatchIn(page).map { m =>
      val l = m.group(1)
      if (!l.startsWith("http")) s"http://help.drip.im$l" else l
    }.toSeq.filter(_.contains("help.drip.im"))
  }
}