f776b5bd09425a7a15a0bf9e3bea8d995e70281e,modules/Gyoi_SpiderControl.py,SpiderControl,run_spider,#SpiderControl#Any#Any#Any#Any#,44

Before Change



    // Running spider.
    def run_spider(self, protocol, target_fqdn, target_port, target_path):
        self.utility.write_log(20, "[In] Run spider [{}].".format(self.file_name))

        // Original log of GyoiThon.
        now_time = self.utility.get_current_date("%Y%m%d%H%M%S")

After Change



    // Running spider.
    def run_spider(self, protocol, target_fqdn, target_port, target_path):
        msg = self.utility.make_log_msg(self.utility.log_in,
                                        self.utility.log_dis,
                                        self.file_name,
                                        action=self.action_name,
                                        note="Run spider",
                                        dest=self.utility.target_host)
        self.utility.write_log(20, msg)

        // Original log of GyoiThon.
        now_time = self.utility.get_current_date("%Y%m%d%H%M%S")
        gyoithon_log = protocol + "_" + target_fqdn + "_" + target_port + "_crawl_response_" + now_time + ".log"
        target_dir_name = target_fqdn + "_" + str(target_port)
        base_log_path = os.path.join("logs", target_dir_name)
        if os.path.exists(base_log_path) is False:
            os.mkdir(base_log_path)
        gyoithon_log_path = os.path.join(base_log_path, gyoithon_log)

        // Default log of Scrapy (Required: Relative path).
        scrapy_log_path = os.path.join(base_log_path, now_time + self.output_filename)

        // Assemble command options.
        target_url = protocol + "://" + target_fqdn + ":" + target_port + target_path

        // Proxy setting.
        proxy = ""
        if self.utility.proxy != "":
            parsed = util.parse_url(self.utility.proxy)
            if self.utility.proxy_user != "":
                proxy = parsed.scheme + "://" + \
                        self.utility.proxy_user + ":" + self.utility.proxy_pass + "@" + \
                        parsed.netloc
            else:
                proxy = parsed.scheme + "://" + parsed.netloc

        // Assemble Scrapy command.
        option = " -a target_url=" + target_url + " -a allow_domain=" + target_fqdn + \
                 " -a concurrent=" + self.spider_concurrent_reqs + " -a depth_limit=" + self.spider_depth_limit + \
                 " -a delay=" + self.spider_delay_time + " -a store_path=" + gyoithon_log_path + \
                 " -a proxy_server=" + proxy + " -a user_agent="" + self.utility.ua + """\
                 " -a encoding=" + self.utility.encoding + " -o " + scrapy_log_path
        close_opton = " -s CLOSESPIDER_TIMEOUT=" + self.spider_time_out + \
                      " -s CLOSESPIDER_ITEMCOUNT=" + self.spider_item_count + \
                      " -s CLOSESPIDER_PAGECOUNT=" + self.spider_page_count + \
                      " -s CLOSESPIDER_ERRORCOUNT=" + self.spider_error_count + " "
        spider_path = os.path.join(self.full_path, "Gyoi_Spider.py")
        command = "scrapy runspider" + close_opton + spider_path + option
        msg = "Execute spider : {}.".format(command)
        self.utility.print_message(OK, msg)
        msg = self.utility.make_log_msg(self.utility.log_mid,
                                        self.utility.log_dis,
                                        self.file_name,
                                        action=self.action_name,
                                        note=msg,
                                        dest=self.utility.target_host)
        self.utility.write_log(20, msg)

        // Execute Scrapy.
        proc = Popen(command, shell=True)
        proc.wait()

        // Get crawling result.
        all_targets_log = []
        target_log = [target_url]
        non_target_log = []
        dict_json = {}
        if os.path.exists(scrapy_log_path):
            with codecs.open(scrapy_log_path, "r", encoding="utf-8") as fin:
                target_text = self.utility.delete_ctrl_char(fin.read())
                if target_text != "":
                    dict_json = json.loads(target_text)
                else:
                    self.utility.print_message(WARNING, "[{}] is empty.".format(scrapy_log_path))

        // Exclude except allowed domains.
        for idx in range(len(dict_json)):
            items = dict_json[idx]["urls"]
            for item in items:
                try:
                    if target_fqdn == util.parse_url(item).host:
                        target_log.append(item)
                    elif util.parse_url(item).scheme in ["http", "https"]:
                        non_target_log.append(item)
                except Exception as e:
                    msg = "Excepting allowed domain is failure : {}".format(e)
                    self.utility.print_message(FAIL, msg)
                    self.utility.write_log(30, msg)

        self.utility.write_log(20, "Get spider result.")
        all_targets_log.append([target_url, gyoithon_log_path, list(set(target_log))])

        msg = self.utility.make_log_msg(self.utility.log_out,
                                        self.utility.log_dis,
                                        self.file_name,
                                        action=self.action_name,
                                        note="Run spider",
                                        dest=self.utility.target_host)
        self.utility.write_log(20, msg)
        return all_targets_log, non_target_log
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 6

Non-data size: 34

Instances


Project Name: gyoisamurai/GyoiThon
Commit Name: f776b5bd09425a7a15a0bf9e3bea8d995e70281e
Time: 2019-05-21
Author: gyoiler3@gmail.com
File Name: modules/Gyoi_SpiderControl.py
Class Name: SpiderControl
Method Name: run_spider


Project Name: gyoisamurai/GyoiThon
Commit Name: f776b5bd09425a7a15a0bf9e3bea8d995e70281e
Time: 2019-05-21
Author: gyoiler3@gmail.com
File Name: modules/Gyoi_VersionCheckerML.py
Class Name: VersionCheckerML
Method Name: identify_product


Project Name: gyoisamurai/GyoiThon
Commit Name: f776b5bd09425a7a15a0bf9e3bea8d995e70281e
Time: 2019-05-21
Author: gyoiler3@gmail.com
File Name: modules/Gyoi_VersionChecker.py
Class Name: VersionChecker
Method Name: identify_product


Project Name: gyoisamurai/GyoiThon
Commit Name: f776b5bd09425a7a15a0bf9e3bea8d995e70281e
Time: 2019-05-21
Author: gyoiler3@gmail.com
File Name: modules/Gyoi_CommentChecker.py
Class Name: CommentChecker
Method Name: get_bad_comment


Project Name: gyoisamurai/GyoiThon
Commit Name: f776b5bd09425a7a15a0bf9e3bea8d995e70281e
Time: 2019-05-21
Author: gyoiler3@gmail.com
File Name: modules/Gyoi_ErrorChecker.py
Class Name: ErrorChecker
Method Name: get_error_message


Project Name: gyoisamurai/GyoiThon
Commit Name: f776b5bd09425a7a15a0bf9e3bea8d995e70281e
Time: 2019-05-21
Author: gyoiler3@gmail.com
File Name: modules/Gyoi_ContentExplorer.py
Class Name: ContentExplorer
Method Name: content_explorer