#!/usr/bin/perl
# Parse Issues MBOX file for:
# Message, Subject, Date, Link
# Deid 2022-04-03

### Main 
# Usage: <script> <mbox-file>
#
# Scans the mbox file line by line with a small state machine and prints
# one record per message in the form:
#     <subject line>||<date line>||<link line>
# The states are visited in order: start -> subject -> date -> http -> start.
# A line is only examined for the item the current state is waiting for.
my $filename = $ARGV[0];
die "Usage: $0 <mbox-file>\n"
  unless defined $filename and length $filename;

# NOTE: the handle stays the bareword INP on purpose - translate() pulls
# continuation lines from this same handle while a subject/link is folded
# over several lines.
open(INP, '<', $filename) or die "Cannot open '$filename': $!\n";

# Item the state machine is currently looking for.
my $state = "start";

# Fields collected for the message being parsed.
my ($subject, $date);

while (my $line = <INP>) {
  chomp $line;

  # Start of the next email.
  if ($state eq "start" and $line =~ /From deid/) {
    $state = "subject";
    next;
  }

  # Subject (decoded; translate() may consume following lines from INP).
  if ($state eq "subject" and $line =~ /Subject: /) {
    $subject = translate($line);
    chomp $subject;    # translate() returns its result newline-terminated
    $state = "date";
    next;
  }

  # Date (kept verbatim; the line is already chomped).
  if ($state eq "date" and $line =~ /Date: /) {
    $date = $line;
    $state = "http";
    next;
  }

  # First https link after the date completes the record.
  if ($state eq "http" and $line =~ /https:\/\//) {
    my $link = translate($line);
    chomp $link;       # translate() returns its result newline-terminated
    print "$subject||$date||$link\n";
    $state = "start";
    next;
  }
}

close(INP);

# translate(TEXT [, FILEHANDLE])
#
# Turns one quoted-printable / RFC 2047 "Q" encoded line into plain ASCII.
#
#   TEXT        the (already chomped) line to decode.
#   FILEHANDLE  optional handle to read continuation lines from; defaults
#               to the global INP handle opened by the main script.
#
# Steps:
#   1. Remove the "=?UTF-8?Q?" encoded-word opener, turn a trailing "?="
#      into a bare "=" (so it is handled like a soft line break) and map
#      "_" to a space.
#   2. While the text ends in "=", drop that "=" and append the next line
#      from FILEHANDLE (cleaned the same way, first tab removed).  A line
#      containing "From:" is consumed but not appended.
#   3. Replace the known "=XX" escape sequences with an ASCII equivalent
#      (or the author's opinion of an ASCII equivalent).
#   4. Double every single quote.
#
# Returns the decoded text followed by a newline.
#
# All working variables are lexical, so the caller's variables are never
# modified.
sub translate {
  my ($text, $fh) = @_;
  $text = ''    unless defined $text;
  $fh   = \*INP unless defined $fh;

  # Escape sequence (upper-case hex) => ASCII replacement.
  my %ascii_for = (
    '=3A' => ':',  '=2E' => '.',  '=25' => '%',  '=24' => '$',
    '=27' => "'",  '=22' => '"',  '=3F' => '?',  '=2C' => ',',
    '=26' => '&',  '=7C' => '|',  '=28' => '(',  '=29' => ')',
    '=3D' => '=',
    '=E2=80=98' => "'",   '=E2=80=99' => "'",
    '=E2=80=93' => '-',   '=E2=80=94' => '-',
    '=E2=80=9C' => '"',   '=E2=80=9D' => '"',
    '=E2=80=A6' => '...',
    '=C2=A0'    => ' ',   '=C3=AB'    => 'e',   '=C3=A7' => 'c',
    '=F0=9F=92=A9' => ':(',
    '=F0=9F=93=9A' => ':',
  );

  $text = _strip_encoded_word_markup($text);

  # The text may be spread over multiple lines.
  while ($text =~ /=$/) {
    chop $text;                          # drop the trailing "="
    my $next = <$fh>;
    $next = '' unless defined $next;     # EOF: nothing left to append
    next if ($next =~ /From:/);
    chomp $next;
    $next = _strip_encoded_word_markup($next);
    $next =~ s/\t//;                     # first tab only (header folding)
    $text .= $next;
  }

  # Decode every escape in ONE pass.  Running one substitution per table
  # entry would let the output of an earlier replacement (the "=" from
  # "=3D", or the hex-digit letters "e"/"c") combine with the following
  # characters and be decoded a second time.
  my $escape = join '|',
               map  { quotemeta }
               sort { length($b) <=> length($a) } keys %ascii_for;
  $text =~ s/($escape)/$ascii_for{uc $1}/ig;

  $text =~ s/'/''/g;

  return $text . "\n";
}

# Removes RFC 2047 "Q" encoded-word markup from one line: deletes the
# "=?UTF-8?Q?" opener, rewrites a trailing "?=" as "=" and maps "_" to " ".
sub _strip_encoded_word_markup {
  my ($piece) = @_;
  $piece =~ s/=\?UTF\-8\?Q\?//ig;
  $piece =~ s/\?=$/=/ig;
  $piece =~ s/_/ /g;
  return $piece;
}