Alternatives and detailed information of nest-crawler

😎 nest-crawler 😎

Crawler and Scraper Module for NestJS

Installation

$ npm install --save nest-crawler

Usage

First, register it in the application module so that Nest can handle dependencies:

import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';

@Module({
  imports: [
    NestCrawlerModule,
  ],
})
export class AppModule {}

Then, just import it and use it:

crawler.module.ts

import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';
@Module({
  imports: [
    NestCrawlerModule,
  ],
})
export class CrawlerModule {}

crawler.service.ts

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  // scraping the specific page
  public async scrape(): Promise<void> {
    interface ExampleCom {
      title: string;
      info: string;
      content: string;
    }

    const data: ExampleCom = await this.crawler.fetch({
      target: 'http://example.com',
      fetch: {
        title: 'h1',
        info: {
          selector: 'p > a',
          attr: 'href',
        },
        content: {
          selector: '.content',
          how: 'html',
        },
      },
    });

    console.log(data);
    // {
    //   title: 'Example Domain',
    //   info: 'http://www.iana.org/domains/example',
    //   content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
    // }
  }

  // crawling multi pages is also supported
  public async crawl(): Promise<void> {
    interface HackerNewsPage {
      title: string;
    }

    const pages: HackerNewsPage[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      fetch: (data: any, index: number, url: string) => ({
        title: '.title > a',
      }),
    });

    console.log(pages);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}

Recipe

Single Page Scraping

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async scrape(): Promise<void> {
    interface ExampleCom {
      title: string;
      info: string;
      content: string;
    }

    const data: ExampleCom = await this.crawler.fetch({
      target: 'http://example.com',
      fetch: {
        title: 'h1',
        info: {
          selector: 'p > a',
          attr: 'href',
        },
        content: {
          selector: '.content',
          how: 'html',
        }
      },
    });

    console.log(data);
    // {
    //   title: 'Example Domain',
    //   info: 'http://www.iana.org/domains/example',
    //   content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
    // }
  }
}

Multi Pages Crawling

You Know the target urls already

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async crawl(): Promise<void> {
    interface Site {
      title: string;
    }

    const sites: Site[] = await this.crawler.fetch({
      target: [
        'https://example1.com',
        'https://example2.com',
        'https://example3.com',
      ],
      fetch: (data: any, index: number, url: string) => ({
        title: 'h1',
      }),
    });

    console.log(sites);
    // [
    //   { title: 'An easiest crawling and scraping module for NestJS' },
    //   { title: 'A minimalistic boilerplate on top of Webpack, Babel, TypeScript and React' },
    //   { title: '[Experimental] React SSR as a view template engine' }
    // ]
  }
}

You Don't Know the Target Urls so Want to Crawl Dynamically

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const pages: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      // fetch each `https://news.ycombinator.com/${x}` and scrape data
      fetch: (data: any, index: number, url: string) => ({
        title: '.title > a',
      }),
    });

    console.log(pages);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}

You Need to Pass Data Dynamically

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async crawl(): Promise<void> {
    interface Img {
      src: string;
    }

    const images: Img[] = await this.crawler.fetch({
      target: {
        url: 'https://some.image.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://some.image.com${x}`,
        },
        fetch: {
          imageIds: {
            listItem: 'div.image',
            data: {
              id: {
                selector: 'div.image-wrapper',
                attr: 'data-image-id',
              },
            },
          },
        },
      },
      // fetch each `https://some.image.com${x}`, pass data and scrape data
      fetch: (data: any, index: number, url: string) => ({
        src: {
          convert: () => `https://some.image.com/images/${data.imageIds[index]}.png`,
        },
      }),
    });

    console.log(images);
    // [
    //   { src: 'https://some.image.com/images/1.png' },
    //   { src: 'https://some.image.com/images/2.png' },
    //   ...
    //   ...
    //   { src: 'https://some.image.com/images/100.png' }
    // ]
  }
}

Waitable (by using `puppeteer`)

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const pages: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      waitFor: 3 * 1000, // wait for the content loaded! (like single page apps)
      fetch: (data: any, index: number, url: string) => ({
        title: '.title > a',
      }),
    });

    console.log(pages);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}

@web-master/node-web-fetch

Cheap and reliable Node.js hosting starts at $3/month, and $1/month static HTML hosting

saltyshiomix / nest-crawler

Programming Languages

Labels

Projects that are alternatives of or similar to nest-crawler

Installation

Usage

Recipe

Single Page Scraping

Multi Pages Crawling

You Know the target urls already

You Don't Know the Target Urls so Want to Crawl Dynamically

You Need to Pass Data Dynamically

Waitable (by using `puppeteer`)

Related

Cheap and reliable Node.js hosting starts at $3/month, and $1/month static HTML hosting

saltyshiomix / nest-crawler

Programming Languages

Labels

Projects that are alternatives of or similar to nest-crawler

Installation

Usage

Recipe

Single Page Scraping

Multi Pages Crawling

You Know the target urls already

You Don't Know the Target Urls so Want to Crawl Dynamically

You Need to Pass Data Dynamically

Waitable (by using puppeteer)

Related

Waitable (by using `puppeteer`)