From cc9255ae51d9115fb60e939b68045d675bf94adf Mon Sep 17 00:00:00 2001
From: jilen
Date: Sat, 2 Nov 2024 20:34:57 +0800
Subject: [PATCH] init snippets

---
 .gitignore            |   2 +
 misc/scrape.scala     | 145 ++++++++++++++++++++++++++++++++++++++++++
 std/macro.scala       |  27 ++++++++
 std/newInstance.scala |   3 +
 zd/gen_user_id.scala  |  19 ++++++
 5 files changed, 196 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 misc/scrape.scala
 create mode 100644 std/macro.scala
 create mode 100644 std/newInstance.scala
 create mode 100644 zd/gen_user_id.scala

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0357b3d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.scala-build/
+.bsp/
\ No newline at end of file
diff --git a/misc/scrape.scala b/misc/scrape.scala
new file mode 100644
index 0000000..a562c8d
--- /dev/null
+++ b/misc/scrape.scala
@@ -0,0 +1,145 @@
+//> using dep org.http4s::http4s-ember-client:1.0.0-M43
+//> using dep org.typelevel::log4cats-slf4j:2.7.0
+import cats.effect.*
+import cats.effect.std.*
+import cats.syntax.all.*
+import org.http4s.ember.client.EmberClientBuilder
+import org.http4s.client.Client
+import org.http4s.client.middleware.FollowRedirect
+import org.http4s.EntityDecoder
+import fs2.*
+import fs2.io.file.*
+import org.typelevel.log4cats.*
+import org.typelevel.log4cats.slf4j.Slf4jFactory
+
+/** Crawls the help-center pages reachable from `RootUrl` and downloads the
+  * images they reference into a local directory.
+  */
+object Scrape extends IOApp.Simple {
+
+  /** Page the crawl starts from. */
+  private val RootUrl = "http://help.drip.im/hc"
+
+  /** Links on pages at this depth or deeper are not followed (start page is depth 0). */
+  private val MaxDepth = 2
+
+  /** Captures everything after `scheme://host/` of an absolute http(s) url. */
+  private val FpRegex = "https?\\://[^/]+/(.*)$".r
+
+  /** Captures the quoted `src` value of an `img` tag. */
+  private val ImgSrcRegex = "img\\s*src\\s*=\\s*\"(\\S+)\"".r
+
+  /** Captures the `/hc/...` path inside an `href`, with or without the host. */
+  private val HcLinkRegex = "href[^\\>]*(?:help\\.drip\\.im)?(/hc/[\\w/]+)".r
+
+  /** Runs the crawl-and-download stream to completion. `IOApp.Simple` expects
+    * `IO[Unit]`, so no `ExitCode` is produced here.
+    */
+  def run: IO[Unit] = saveImages().compile.drain
+
+  given LoggerFactory[IO] = Slf4jFactory.create[IO]
+
+  /** Builds the stream that crawls from `RootUrl` and downloads each image url
+    * it emits. A failed download is printed and skipped.
+    */
+  private def saveImages(): Stream[IO, Unit] =
+    for {
+      client <- Stream.resource(EmberClientBuilder.default[IO].build)
+      savedRef <- Stream.eval(Ref.of[IO, Set[String]](Set.empty))
+      scanedRef <- Stream.eval(Ref.of[IO, Set[String]](Set.empty))
+      _ <- loopScrape(scanedRef, client, RootUrl, 0)
+        .evalMap { img =>
+          downloadImage(savedRef, client, img)
+            .handleErrorWith(e => IO.println(s"Error download ${img}, ${e}"))
+        }
+    } yield ()
+
+  /** Removes a trailing `/?...` segment from `p`: a slash directly followed by
+    * `?` and any run of non-slash characters up to the end of the string.
+    * Paths without such a suffix are returned unchanged.
+    */
+  def normalizeFilePath(p: String): String =
+    p.replaceAll("/\\?[^/]*$", "")
+
+  /** Downloads `url` into the local image directory unless it is already in
+    * `savedUrls`, and records it there once the file has been written.
+    *
+    * The body is wrapped in `IO.defer` so that every failure (a url that is not
+    * absolute http(s), an invalid file path) is raised inside the returned `IO`,
+    * where the caller's `handleErrorWith` can report it, instead of being thrown
+    * while the `IO` is built and failing the whole stream.
+    */
+  private def downloadImage(savedUrls: Ref[IO, Set[String]], client: Client[IO], url: String): IO[Unit] =
+    IO.defer {
+      url match {
+        case FpRegex(p) =>
+          val fp = Path(s"/Users/jilen/Downloads/drip_help_img/${normalizeFilePath(p)}")
+          val fClient = FollowRedirect(maxRedirects = 3)(client)
+          val download = for {
+            _ <- fp.parent.map(Files[IO].createDirectories).getOrElse(IO.unit)
+            _ <- fClient.expect(url)(using EntityDecoder.binFile[IO](fp))
+            _ <- savedUrls.update(_ + url)
+            // Reported only after the file is on disk and recorded.
+            _ <- IO.println(s":::${url} saved to ${fp}")
+          } yield ()
+          savedUrls.get.flatMap { surls =>
+            if (surls.contains(url)) IO.println(s":::${url} Already saved") else download
+          }
+        case _ =>
+          IO.raiseError(new IllegalArgumentException(s"Unsupported image url: ${url}"))
+      }
+    }
+
+  /** Emits the image urls found on `rootUrl` and, while `depth` is below
+    * `MaxDepth`, on the help-center pages it links to (recursively).
+    * `scanedRef` holds every page url already visited so none is fetched twice.
+    */
+  private def loopScrape(scanedRef: Ref[IO, Set[String]],
+    client: Client[IO], rootUrl: String, depth: Int): Stream[IO, String] = {
+    // A page that cannot be fetched is reported and treated as empty, so one
+    // broken link neither stops the crawl nor goes unnoticed.
+    val fetchPage: IO[String] =
+      client.expect[String](rootUrl).handleErrorWith { e =>
+        IO.println(s"Error fetch ${rootUrl}, ${e}").as("")
+      }
+
+    // Atomically marks the page as visited and reports whether it already was.
+    val markVisited: IO[Boolean] =
+      scanedRef.modify(seen => (seen + rootUrl, seen.contains(rootUrl)))
+
+    Stream.eval(markVisited).flatMap { alreadyScanned =>
+      if (alreadyScanned) {
+        Stream.empty
+      } else {
+        Stream.eval(fetchPage).flatMap { page =>
+          val remain = if (depth >= MaxDepth) {
+            Stream.empty
+          } else {
+            Stream.emits(extractLinks(page)).flatMap(l => loopScrape(scanedRef, client, l, depth + 1))
+          }
+          Stream.emits(extractImages(page)) ++ remain
+        }
+      }
+    }
+  }
+
+  /** Returns the `src` of every `img` tag in `page`, skipping urls that contain
+    * `wx-static.drip.im` or `qiniu.drip.im`.
+    */
+  private def extractImages(page: String): Seq[String] =
+    ImgSrcRegex
+      .findAllMatchIn(page)
+      .map(_.group(1))
+      .filterNot(src => src.contains("wx-static.drip.im") || src.contains("qiniu.drip.im"))
+      .toSeq
+
+  /** Returns an absolute `http://help.drip.im/hc/...` url for every help-center
+    * link in `page`. The captured group always starts with `/hc/`, so the host
+    * is prepended unconditionally.
+    */
+  private def extractLinks(page: String): Seq[String] =
+    HcLinkRegex
+      .findAllMatchIn(page)
+      .map(m => s"http://help.drip.im${m.group(1)}")
+      .toSeq
+}
diff --git a/std/macro.scala b/std/macro.scala
new file mode 100644
index 0000000..c45b24e
--- /dev/null
+++ b/std/macro.scala
@@ -0,0 +1,27 @@
+//> using scala 3.4.0
+
+import scala.quoted.*
+
+/** Prints a description of the tree of `x` while the call site is compiled;
+  * the call itself expands to `()`.
+  */
+inline def printTree[A](inline x: A): Unit = ${ printTreeImpl[A]('x) }
+
+/** Macro implementation of `printTree`: strips `Inlined` wrappers, then prints
+  * the body of a lambda, or the whole term when it is not a lambda.
+  */
+private def printTreeImpl[A](x: Expr[A])(using Quotes): Expr[Unit] = {
+  import quotes.reflect.*
+  def print0(t: Term): Unit =
+    t match {
+      case Inlined(_, _, inner) =>
+        print0(inner)
+      case Lambda(_, body) =>
+        println(s"Lambda body ${body}")
+      case other =>
+        // Not a lambda, so it must not be labelled as one.
+        println(s"Term ${other}")
+    }
+  print0(x.asTerm)
+  '{ () }
+}
diff --git a/std/newInstance.scala b/std/newInstance.scala
new file mode 100644
index 0000000..8417713
--- /dev/null
+++ b/std/newInstance.scala
@@ -0,0 +1,3 @@
+//> using scala 3.3.4
+
+
diff --git a/zd/gen_user_id.scala b/zd/gen_user_id.scala
new file mode 100644
index 0000000..b864509
--- /dev/null
+++ b/zd/gen_user_id.scala
@@ -0,0 +1,19 @@
+//> using dep com.google.guava:guava:33.3.0-jre
+
+import com.google.common.base.Charsets
+import com.google.common.hash.Hashing
+
+/** Prints `<id>|<sign>`, where `sign` is the SHA-256 of `id` concatenated with
+  * the SHA-256 of `key`.
+  */
+@main
+def genId(id: String, key: String): Unit = {
+  val sign = sha256(id + sha256(key))
+  println(s"${id}|${sign}")
+}
+
+/** Returns the SHA-256 hash of the UTF-8 encoding of `str`, rendered with the
+  * hash code's `toString`.
+  */
+def sha256(str: String): String =
+  Hashing.sha256().hashString(str, Charsets.UTF_8).toString
\ No newline at end of file