Skip to content

Releases: Mediashare/Spider

Symfony 5 Migration

21 Dec 04:34
Compare
Choose a tag to compare
0.1.0

Migration

0.0.2

21 Dec 03:09
Compare
Choose a tag to compare

I have not yet found a way to bypass the Symfony project...

Stable version

21 Dec 02:45
Compare
Choose a tag to compare

Packagist Version

Installation

composer require mediashare/spider

Usage

Create an index.php file and initialize the config.

<?php
// ./index.php
// Minimal usage example: build a Config, point the Spider at a URL and run it.
require 'vendor/autoload.php';

// Website Config
$config = new \Mediashare\Entity\Config();
$config->setWebspider(true); // Crawl all website
$config->setReportsDir(__DIR__.'/reports/'); // Default reports path
$config->setModulesDir(__DIR__.'/modules/'); // Default modules path
// Prompt Console / Dump
$config->setVerbose(true); // Prompt verbose output
$config->setJson(false); // Prompt json output
// Modules Activation (configured once; the example previously repeated this call)
$config->enableAllModule(true); // Enable all modules
// $config->addModules(['Links', 'Search']); // Select one or more modules to use with class name

// Url
$url = new \Mediashare\Entity\Url('http://marquand.pro');
// Spider
$spider = new \Mediashare\Spider($url, $config);
$result = $spider->run();
// dump($result);

Create your own module to execute actions when the crawler scrapes a webpage.

<?php
// ./modules/Links.php
namespace Mediashare\Modules;

/**
 * Module that counts how many times each link target (href) appears
 * among the <a> elements of the crawled webpage.
 */
class Links {
    public $name = "Links";
    public $description = "Get all links in webpage";
    public $config; // Spider Config
    public $url; // Url with Headers & Body
    public $crawler; // Dom for crawl in webpage
    public $variables = "0"; // Variables injected
    public $errors; // Output errors
    // Read by run(). Declared explicitly so that assigning it does not create
    // a dynamic property (deprecated since PHP 8.2).
    // NOTE(review): presumably set by the Spider before run() is called — confirm.
    public $dom;

    /**
     * Count the occurrences of every non-empty href found in <a> elements.
     *
     * @return array Map of trimmed href => number of occurrences. PHP casts
     *               purely numeric string keys (e.g. "0") to integer keys.
     */
    public function run() {
        $links = [];
        foreach ($this->dom->filter('a') as $link) {
            // Surrounding whitespace is not significant in an href.
            $href = trim($link->getAttribute('href'));
            // Compare against '' rather than relying on truthiness, so that
            // the valid relative link "0" is not silently dropped.
            if ($href !== '') {
                $links[$href] = ($links[$href] ?? 0) + 1;
            }
        }
        return $links;
    }
}

Execute the code from the console.

php index.php

Output

-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*
* Output file result: /home/slote/Bureau/Spider/var/reports/marquand.pro/5dfaf1c0147c6.json
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*