Recent

Author Topic: Extracting data from a website  (Read 3053 times)

madref

  • Hero Member
  • *****
  • Posts: 954
  • ..... A day not Laughed is a day wasted !!
    • Nursing With Humour
Extracting data from a website
« on: March 23, 2019, 10:35:19 am »
Take a look at this game sheet https://www.nijb.nl/nijbsheet.php?GameID=55668&ShowGameSheet=1 and especially the bottom part where the goals and penalties per period are scored.


Is it possible to extract that piece of data from this website?
And if yes then how?
You treat a disease, you win, you lose.
You treat a person and I guarantee you, you win, no matter the outcome.

Lazarus 3.99 (rev main_3_99-649-ge13451a5ab) FPC 3.3.1 x86_64-darwin-cocoa
Mac OS X Monterey

wp

  • Hero Member
  • *****
  • Posts: 11923
Re: Extracting data from a website
« Reply #1 on: March 23, 2019, 02:04:14 pm »
Use the fasthtmlparser unit which comes with fpc: https://forum.lazarus.freepascal.org/index.php/topic,43090.msg301176.html#msg301176.

This technique for extraction of table data from html is applied in the unit fpsHTML of the fpspreadsheet package. The unit is not self-contained but you can get the idea how it works.

madref

  • Hero Member
  • *****
  • Posts: 954
  • ..... A day not Laughed is a day wasted !!
    • Nursing With Humour
Re: Extracting data from a website
« Reply #2 on: March 24, 2019, 08:24:25 am »
When I try to run the program I get an error in line 88: Illegal qualifier.


Code: Pascal  [Select][+][-]
  1. program project1;
  2.  
  3.  
  4. {$mode objfpc}{$H+}
  5.  
  6.  
  7. uses
  8.   classes, sysutils, fphttpclient, fasthtmlparser;
  9.  
  10.  
  11. type
  12.   THtmlTextExtractor = class
  13.   private
  14.     FTempStream: TStream;
  15.     FIgnore: Boolean;
  16.     function CleanWhiteSpace(AText: String): String;
  17.     function FixHtmlEntities(AText: String): String;
  18.     procedure TagFoundHandler(NoCaseTag, ActualTag: string);
  19.     procedure TextFoundHandler(AText: String);
  20.   public
  21.     function ExtractFromHtml(AHtml: String): String;
  22.   end;
  23.  
  24.  
  25. function THtmlTextExtractor.CleanWhiteSpace(AText: String): String;
  26. begin
  27.   if (AText <> '') and (AText[1] = #10) then
  28.     while (AText <> '') and (AText[1] in [#10, ' ', #9]) do Delete(AText, 1, 1);
  29.   Result := AText;
  30. end;
  31.  
  32.  
  33. function THtmlTextExtractor.FixHtmlEntities(AText: String): String;
  34. var
  35.   P, PEnd: PChar;
  36.   s: String;
  37. begin
  38.   Result := '';
  39.   P := @AText[1];
  40.   PEnd := P + Length(AText);
  41.   while P < PEnd do begin
  42.     if P^ = '&' then
  43.     begin
  44.       s := '';
  45.       inc(P);
  46.       while (P <= PEnd) and (P^ <> ';') do begin
  47.         s := s + P^;
  48.         inc(P);
  49.       end;
  50.       case s of
  51.         'auml' : Result := Result + 'ä';
  52.         'Auml' : Result := Result + 'Ä';
  53.         'uuml' : Result := Result + 'ü';
  54.         'Uuml' : Result := Result + 'Ü';
  55.         'ouml' : Result := Result + 'ö';
  56.         'Ouml' : Result := Result + 'Ö';
  57.         'szlig': Result := Result + 'ß';
  58.         'nbsp' : Result := Result + ' ';
  59.         'lt'   : Result := Result + '<';
  60.         'gt'   : Result := Result + '>';
  61.         'amp'  : Result := Result + '&';
  62.         // ... add more...
  63.       end;
  64.     end else
  65.       Result := Result + P^;
  66.     inc(P);
  67.   end;
  68.   s := Result;
  69. end;
  70.  
  71.  
  72. procedure THtmlTextExtractor.TagFoundHandler(NoCaseTag, ActualTag: string);
  73. begin
  74.   // Use the FIgnore flag to skip some tags not needed
  75.   if (Pos('<HTML', NoCasetag) = 1) or
  76.      (NoCaseTag = '</SCRIPT>') or
  77.      (NoCaseTag = '</BUTTON>')
  78.   then
  79.     FIgnore := false
  80.   else
  81.   if (Pos('<SCRIPT', NoCaseTag) = 1) or
  82.      (Pos('<BUTTON', NoCaseTag) = 1) or
  83.      (NoCaseTag = '</HTML>')
  84.   then
  85.     FIgnore := true;
  86.  
  87.  
  88.   if FIgnore then
  89.     exit;
  90.  
  91.  
  92.   // Write a line-break after these tags
  93.   if (NoCasetag = '<BR>') or (NoCaseTag = '<BR />') or (NoCaseTag = '<BR/>') or
  94.      (NoCaseTag = '</P>') or (NoCaseTag = '</DIV>') or (NoCaseTag = '</TR>')
  95.   then
  96.     FTempStream.Write(LineEnding[1], Sizeof(LineEnding));
  97. end;
  98.  
  99.  
  100. procedure THtmlTextExtractor.TextFoundHandler(AText: String);
  101. var
  102.   s: String;
  103. begin
  104.   if FIgnore then
  105.     exit;
  106.   s := CleanWhiteSpace(AText);
  107.   if s = '' then
  108.     exit;
  109.   s := FixHtmlEntities(s);
  110.   FTempStream.Write(s[1], Length(s));
  111. end;
  112.  
  113.  
  114. function THtmlTextExtractor.ExtractFromHtml(AHtml: String): String;
  115. var
  116.   parser: THtmlParser;
  117. begin
  118.   if AHtml = '' then
  119.     exit ('');
  120.  
  121.  
  122.   parser := THtmlParser.Create(AHtml);
  123.   FTempStream := TMemoryStream.Create;
  124.   try
  125.     parser.OnFoundTag := @TagFoundHandler;
  126.     parser.OnFoundText := @TextFoundHandler;
  127.     parser.Exec;
  128.     FTempStream.Position := 0;
  129.     SetLength(Result, FTempStream.Size);
  130.     FTempStream.Read(Result[1], FTempStream.Size);
  131.   finally
  132.     FTempStream.Free;
  133.     parser.Free;
  134.   end;
  135. end;
  136.  
  137.  
  138. procedure SaveStringToFile(AText, AFileName: String);
  139. var
  140.   F: TextFile;
  141. begin
  142.   AssignFile(F, AFileName);
  143.   Rewrite(F);
  144.   WriteLn(F, AText);
  145.   CloseFile(F);
  146. end;
  147.  
  148.  
  149. var
  150.   s: String;
  151.   extractor: THtmlTextExtractor;
  152. begin
  153.   s := TFPHTTPClient.SimpleGet('https://trainingslager.onlineliga.de/#url=/player/overview?playerId=28056');
  154.   if s <> '' then begin
  155.     SaveStringToFile(s, 'text.html');
  156.     extractor := THTMLTextExtractor.Create;
  157.     try
  158.       s := extractor.ExtractFromHtml(s);
  159.       SaveStringToFile(s, 'test.txt');
  160.     finally
  161.       extractor.Free;
  162.     end;
  163.   end;
  164. end.


What's wrong with it?
You treat a disease, you win, you lose.
You treat a person and I guarantee you, you win, no matter the outcome.

Lazarus 3.99 (rev main_3_99-649-ge13451a5ab) FPC 3.3.1 x86_64-darwin-cocoa
Mac OS X Monterey

440bx

  • Hero Member
  • *****
  • Posts: 4070
Re: Extracting data from a website
« Reply #3 on: March 24, 2019, 09:11:14 am »
When I try to run the program I get an error in line 88: Illegal qualifier.
What's wrong with it?
Do you get the error when you try to _run_ the program or when you try to _compile_ it ?

For the record, your program - as you posted it - compiles just fine on my installation (FPC v3.0.4)
(FPC v3.0.4 and Lazarus 1.8.2) or (FPC v3.2.2 and Lazarus v3.2) on Windows 7 SP1 64bit.

madref

  • Hero Member
  • *****
  • Posts: 954
  • ..... A day not Laughed is a day wasted !!
    • Nursing With Humour
Re: Extracting data from a website
« Reply #4 on: March 24, 2019, 09:18:30 am »
While compiling, on Lazarus 2.0.0 and FPC 3.0.4 on macOS Mojave.
You treat a disease, you win, you lose.
You treat a person and I guarantee you, you win, no matter the outcome.

Lazarus 3.99 (rev main_3_99-649-ge13451a5ab) FPC 3.3.1 x86_64-darwin-cocoa
Mac OS X Monterey

440bx

  • Hero Member
  • *****
  • Posts: 4070
Re: Extracting data from a website
« Reply #5 on: March 24, 2019, 09:35:08 am »
While compiling, on Lazarus 2.0.0 and FPC 3.0.4 on macOS Mojave.
I don't run into any problems compiling it with FPC v3.0.4 using Lazarus v1.8.2 on Win 7. 

It looks like the problem is related to something in your installation/configuration.  Hopefully someone with a similar installation as yours can shed some light on the problem.

(FPC v3.0.4 and Lazarus 1.8.2) or (FPC v3.2.2 and Lazarus v3.2) on Windows 7 SP1 64bit.

howardpc

  • Hero Member
  • *****
  • Posts: 4144
Re: Extracting data from a website
« Reply #6 on: March 24, 2019, 09:59:23 am »
The problem arises from the different definition of LineEnding between Windows/Unixes.
You need to alter your TagFoundHandler routine to something like the following (I've simplified your logic slightly, because there is no point testing for the case that FIgnore is false):
Code: Pascal  [Select][+][-]
  { Parser callback for every tag. Maintains the FIgnore flag and writes a
    line break after tags that end a visual line.
    NoCaseTag is the upper-cased tag text; ActualTag is not used here. }
  procedure THtmlTextExtractor.TagFoundHandler(NoCaseTag, ActualTag: String);
  var
    lineBreak: String;
  begin
    // Use the FIgnore flag to skip some tags not needed.
    // The flag has to stay set until the matching closing tag arrives;
    // recomputing it on every tag would switch output back on at the first
    // tag nested inside <button>...</button>.
    if (Pos('<SCRIPT', NoCaseTag) = 1) or
       (Pos('<BUTTON', NoCaseTag) = 1) or
       (NoCaseTag = '</HTML>')
    then
      FIgnore := True
    else
    if (Pos('<HTML', NoCaseTag) = 1) or
       (NoCaseTag = '</SCRIPT>') or
       (NoCaseTag = '</BUTTON>')
    then
      FIgnore := False;

    if FIgnore then
      Exit;

    // Write a line-break after these tags
    if (NoCaseTag = '<BR>') or (NoCaseTag = '<BR />') or (NoCaseTag = '<BR/>') or
       (NoCaseTag = '</P>') or (NoCaseTag = '</DIV>') or (NoCaseTag = '</TR>')
    then begin
      // LineEnding is a Char on Unix/macOS and a two-character string on
      // Windows. Assigning it to a String needs no conditional compilation
      // and Length() gives the exact number of bytes to write on both.
      lineBreak := LineEnding;
      FTempStream.Write(lineBreak[1], Length(lineBreak));
    end;
  end;

Actually, if you just want to write a #10, it would be better to set c to #10 directly.
« Last Edit: March 24, 2019, 10:01:19 am by howardpc »

madref

  • Hero Member
  • *****
  • Posts: 954
  • ..... A day not Laughed is a day wasted !!
    • Nursing With Humour
Re: Extracting data from a website
« Reply #7 on: March 24, 2019, 10:28:28 am »
Thanx Howard, now it works.


P.S. it's not code that I have written. It's WP's (see second post)
You treat a disease, you win, you lose.
You treat a person and I guarantee you, you win, no matter the outcome.

Lazarus 3.99 (rev main_3_99-649-ge13451a5ab) FPC 3.3.1 x86_64-darwin-cocoa
Mac OS X Monterey

 

TinyPortal © 2005-2018